howard.objects.variants

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26import cyvcf2
   27import pyBigWig
   28
   29from howard.functions.commons import *
   30from howard.objects.database import *
   31from howard.functions.databases import *
   32from howard.functions.utils import *
   33
   34
   35class Variants:
   36
   37    def __init__(
   38        self,
   39        conn=None,
   40        input: str = None,
   41        output: str = None,
   42        config: dict = {},
   43        param: dict = {},
   44        load: bool = False,
   45    ) -> None:
   46        """
   47        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   48        header
   49
   50        :param conn: the connection to the database
   51        :param input: the input file
   52        :param output: the output file
   53        :param config: a dictionary containing the configuration of the model
   54        :param param: a dictionary containing the parameters of the model
   55        """
   56
   57        # Init variables
   58        self.init_variables()
   59
   60        # Input
   61        self.set_input(input)
   62
   63        # Config
   64        self.set_config(config)
   65
   66        # Param
   67        self.set_param(param)
   68
   69        # Output
   70        self.set_output(output)
   71
   72        # connexion
   73        self.set_connexion(conn)
   74
   75        # Header
   76        self.set_header()
   77
   78        # Samples
   79        self.set_samples()
   80
   81        # Load data
   82        if load:
   83            self.load_data()
   84
   85    def set_samples(self, samples: list = None) -> list:
   86        """
   87        The function `set_samples` sets the samples attribute of an object to a provided list or
   88        retrieves it from a parameter dictionary.
   89
   90        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   91        input and sets the `samples` attribute of the class to the provided list. If no samples are
   92        provided, it tries to get the samples from the class's parameters using the `get_param` method
   93        :type samples: list
   94        :return: The `samples` list is being returned.
   95        """
   96
   97        if not samples:
   98            samples = self.get_param().get("samples", {}).get("list", None)
   99
  100        self.samples = samples
  101
  102        return samples
  103
  104    def get_samples(self) -> list:
  105        """
  106        This function returns a list of samples.
  107        :return: The `get_samples` method is returning the `samples` attribute of the object.
  108        """
  109
  110        return self.samples
  111
  112    def get_samples_check(self) -> bool:
  113        """
  114        This function returns the value of the "check" key within the "samples" dictionary retrieved
  115        from the parameters.
  116        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  117        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  118        method. If the key "check" is not found, it will return `False`.
  119        """
  120
  121        return self.get_param().get("samples", {}).get("check", True)
  122
  123    def set_input(self, input: str = None) -> None:
  124        """
  125        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  126        attributes in the class accordingly.
  127
  128        :param input: The `set_input` method in the provided code snippet is used to set attributes
  129        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  130        :type input: str
  131        """
  132
  133        if input and not isinstance(input, str):
  134            try:
  135                self.input = input.name
  136            except:
  137                log.error(f"Input file '{input} in bad format")
  138                raise ValueError(f"Input file '{input} in bad format")
  139        else:
  140            self.input = input
  141
  142        # Input format
  143        if input:
  144            input_name, input_extension = os.path.splitext(self.input)
  145            self.input_name = input_name
  146            self.input_extension = input_extension
  147            self.input_format = self.input_extension.replace(".", "")
  148
  149    def set_config(self, config: dict) -> None:
  150        """
  151        The set_config function takes a config object and assigns it as the configuration object for the
  152        class.
  153
  154        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  155        contains configuration settings for the class. When you call the `set_config` function with a
  156        dictionary object as the argument, it will set that dictionary as the configuration object for
  157        the class
  158        :type config: dict
  159        """
  160
  161        self.config = config
  162
  163    def set_param(self, param: dict) -> None:
  164        """
  165        This function sets a parameter object for the class based on the input dictionary.
  166
  167        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  168        as the `param` attribute of the class instance
  169        :type param: dict
  170        """
  171
  172        self.param = param
  173
  174    def init_variables(self) -> None:
  175        """
  176        This function initializes the variables that will be used in the rest of the class
  177        """
  178
  179        self.prefix = "howard"
  180        self.table_variants = "variants"
  181        self.dataframe = None
  182
  183        self.comparison_map = {
  184            "gt": ">",
  185            "gte": ">=",
  186            "lt": "<",
  187            "lte": "<=",
  188            "equals": "=",
  189            "contains": "SIMILAR TO",
  190        }
  191
  192        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  193
  194        self.code_type_map_to_sql = {
  195            "Integer": "INTEGER",
  196            "String": "VARCHAR",
  197            "Float": "FLOAT",
  198            "Flag": "VARCHAR",
  199        }
  200
  201        self.index_additionnal_fields = []
  202
  203    def get_indexing(self) -> bool:
  204        """
  205        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  206        returns False.
  207        :return: The value of the indexing parameter.
  208        """
  209
  210        return self.get_param().get("indexing", False)
  211
  212    def get_connexion_config(self) -> dict:
  213        """
  214        The function `get_connexion_config` returns a dictionary containing the configuration for a
  215        connection, including the number of threads and memory limit.
  216        :return: a dictionary containing the configuration for the Connexion library.
  217        """
  218
  219        # config
  220        config = self.get_config()
  221
  222        # Connexion config
  223        connexion_config = {}
  224        threads = self.get_threads()
  225
  226        # Threads
  227        if threads:
  228            connexion_config["threads"] = threads
  229
  230        # Memory
  231        # if config.get("memory", None):
  232        #     connexion_config["memory_limit"] = config.get("memory")
  233        if self.get_memory():
  234            connexion_config["memory_limit"] = self.get_memory()
  235
  236        # Temporary directory
  237        if config.get("tmp", None):
  238            connexion_config["temp_directory"] = config.get("tmp")
  239
  240        # Access
  241        if config.get("access", None):
  242            access = config.get("access")
  243            if access in ["RO"]:
  244                access = "READ_ONLY"
  245            elif access in ["RW"]:
  246                access = "READ_WRITE"
  247            connexion_db = self.get_connexion_db()
  248            if connexion_db in ":memory:":
  249                access = "READ_WRITE"
  250            connexion_config["access_mode"] = access
  251
  252        return connexion_config
  253
  254    def get_duckdb_settings(self) -> dict:
  255        """
  256        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  257        string.
  258        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  259        """
  260
  261        # config
  262        config = self.get_config()
  263
  264        # duckdb settings
  265        duckdb_settings_dict = {}
  266        if config.get("duckdb_settings", None):
  267            duckdb_settings = config.get("duckdb_settings")
  268            duckdb_settings = full_path(duckdb_settings)
  269            # duckdb setting is a file
  270            if os.path.exists(duckdb_settings):
  271                with open(duckdb_settings) as json_file:
  272                    duckdb_settings_dict = yaml.safe_load(json_file)
  273            # duckdb settings is a string
  274            else:
  275                duckdb_settings_dict = json.loads(duckdb_settings)
  276
  277        return duckdb_settings_dict
  278
  279    def set_connexion_db(self) -> str:
  280        """
  281        The function `set_connexion_db` returns the appropriate database connection string based on the
  282        input format and connection type.
  283        :return: the value of the variable `connexion_db`.
  284        """
  285
  286        # Default connexion db
  287        default_connexion_db = ":memory:"
  288
  289        # Find connexion db
  290        if self.get_input_format() in ["db", "duckdb"]:
  291            connexion_db = self.get_input()
  292        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  293            connexion_db = default_connexion_db
  294        elif self.get_connexion_type() in ["tmpfile"]:
  295            tmp_name = tempfile.mkdtemp(
  296                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  297            )
  298            connexion_db = f"{tmp_name}/tmp.db"
  299        elif self.get_connexion_type() != "":
  300            connexion_db = self.get_connexion_type()
  301        else:
  302            connexion_db = default_connexion_db
  303
  304        # Set connexion db
  305        self.connexion_db = connexion_db
  306
  307        return connexion_db
  308
  309    def set_connexion(self, conn) -> None:
  310        """
  311        The function `set_connexion` creates a connection to a database, with options for different
  312        database formats and settings.
  313
  314        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  315        database. If a connection is not provided, a new connection to an in-memory database is created.
  316        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  317        sqlite
  318        """
  319
  320        # Connexion db
  321        connexion_db = self.set_connexion_db()
  322
  323        # Connexion config
  324        connexion_config = self.get_connexion_config()
  325
  326        # Connexion format
  327        connexion_format = self.get_config().get("connexion_format", "duckdb")
  328        # Set connexion format
  329        self.connexion_format = connexion_format
  330
  331        # Connexion
  332        if not conn:
  333            if connexion_format in ["duckdb"]:
  334                conn = duckdb.connect(connexion_db, config=connexion_config)
  335                # duckDB settings
  336                duckdb_settings = self.get_duckdb_settings()
  337                if duckdb_settings:
  338                    for setting in duckdb_settings:
  339                        setting_value = duckdb_settings.get(setting)
  340                        if isinstance(setting_value, str):
  341                            setting_value = f"'{setting_value}'"
  342                        conn.execute(f"PRAGMA {setting}={setting_value};")
  343            elif connexion_format in ["sqlite"]:
  344                conn = sqlite3.connect(connexion_db)
  345
  346        # Set connexion
  347        self.conn = conn
  348
  349        # Log
  350        log.debug(f"connexion_format: {connexion_format}")
  351        log.debug(f"connexion_db: {connexion_db}")
  352        log.debug(f"connexion config: {connexion_config}")
  353        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  354
  355    def set_output(self, output: str = None) -> None:
  356        """
  357        The `set_output` function in Python sets the output file based on the input or a specified key
  358        in the config file, extracting the output name, extension, and format.
  359
  360        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  361        the output file. If the config file has an 'output' key, the method sets the output to the value
  362        of that key. If no output is provided, it sets the output to `None`
  363        :type output: str
  364        """
  365
  366        if output and not isinstance(output, str):
  367            self.output = output.name
  368        else:
  369            self.output = output
  370
  371        # Output format
  372        if self.output:
  373            output_name, output_extension = os.path.splitext(self.output)
  374            self.output_name = output_name
  375            self.output_extension = output_extension
  376            self.output_format = self.output_extension.replace(".", "")
  377        else:
  378            self.output_name = None
  379            self.output_extension = None
  380            self.output_format = None
  381
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (`header_list`) and as a VCF reader object (`header_vcf`).

        Header lookup order, depending on the input format:
        1. explicit header file given in config ("header_file")
        2. header embedded in the input VCF/HDR file (compressed or not)
        3. external "<input>.hdr" file next to the input
        4. header reconstructed from the file's columns (via Database)
        5. fallback to a minimal default VCF header
        """

        input_file = self.get_input()
        # Minimal valid VCF header used as fallback
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            # Only these formats can carry (or be given) a VCF-style header
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itsself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            # NOTE(review): vcf.Writer appears to be used here
                            # only for its side effect of writing the header
                            # to `f` — confirm against PyVCF behavior
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with rel columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:
                        # NOTE(review): broad except — any failure in the
                        # reconstruction above silently falls back to the
                        # default VCF header
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:
            # No input file: no header available
            self.header_list = None
            self.header_vcf = None
  483
  484    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  485        """
  486        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  487        DataFrame based on the connection format.
  488
  489        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  490        represents the SQL query you want to execute. This query will be used to fetch data from a
  491        database and convert it into a pandas DataFrame
  492        :type query: str
  493        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  494        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  495        function will only fetch up to that number of rows from the database query result. If no limit
  496        is specified,
  497        :type limit: int
  498        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  499        """
  500
  501        # Connexion format
  502        connexion_format = self.get_connexion_format()
  503
  504        # Limit in query
  505        if limit:
  506            pd.set_option("display.max_rows", limit)
  507            if connexion_format in ["duckdb"]:
  508                df = (
  509                    self.conn.execute(query)
  510                    .fetch_record_batch(limit)
  511                    .read_next_batch()
  512                    .to_pandas()
  513                )
  514            elif connexion_format in ["sqlite"]:
  515                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  516
  517        # Full query
  518        else:
  519            if connexion_format in ["duckdb"]:
  520                df = self.conn.execute(query).df()
  521            elif connexion_format in ["sqlite"]:
  522                df = pd.read_sql_query(query, self.conn)
  523
  524        return df
  525
  526    def get_overview(self) -> None:
  527        """
  528        The function prints the input, output, config, and dataframe of the current object
  529        """
  530        table_variants_from = self.get_table_variants(clause="from")
  531        sql_columns = self.get_header_columns_as_sql()
  532        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  533        df = self.get_query_to_df(sql_query_export)
  534        log.info(
  535            "Input:  "
  536            + str(self.get_input())
  537            + " ["
  538            + str(str(self.get_input_format()))
  539            + "]"
  540        )
  541        log.info(
  542            "Output: "
  543            + str(self.get_output())
  544            + " ["
  545            + str(str(self.get_output_format()))
  546            + "]"
  547        )
  548        log.info("Config: ")
  549        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  550            "\n"
  551        ):
  552            log.info("\t" + str(d))
  553        log.info("Param: ")
  554        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  555            "\n"
  556        ):
  557            log.info("\t" + str(d))
  558        log.info("Sample list: " + str(self.get_header_sample_list()))
  559        log.info("Dataframe: ")
  560        for d in str(df).split("\n"):
  561            log.info("\t" + str(d))
  562
  563        # garbage collector
  564        del df
  565        gc.collect()
  566
  567        return None
  568
    def get_stats(self) -> dict:
        """
        Compute statistics on the current object and return them as a dict.

        The returned dictionary contains:
        - "Infos": input file, number of variants/samples, INFO/FORMAT counts
        - "Variants": counts by chromosome, by type (SNV/MNV/InDel) and
          SNV substitutions
        - "Samples": per-sample genotype counts (when GT/FORMAT available)
        - "Header": description of the INFO and FORMAT header fields
        - "Quality": QUAL statistics (when the QUAL column is present)

        :return: a dictionary of statistics
        """

        # Log
        log.info(f"Stats Calculation...")

        # table varaints
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header: INFO and FORMAT field definitions from the VCF header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of total per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when genotypes (GT) are declared
        # and a FORMAT column exists
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; a sample cell counts when it
                # starts with a genotype (e.g. 0/1) and has as many
                # ':'-separated fields as the FORMAT column
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts only when at least one genotype matched
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` numbers the fields continuously across INFO and FORMAT
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map PyVCF special codes back to VCF Number letters
                # (A=per-alt, G=per-genotype, R=per-allele, .=unknown)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL: basic statistics over numeric QUAL values ('.' excluded)
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel
        # NOTE(review): in the 'InDel' branch below, SQL AND binds tighter
        # than OR, so the predicate reads as
        # len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT));
        # confirm whether (len(REF) > 1 OR len(ALT) > 1) AND len(REF) != len(ALT)
        # was intended.
        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution counts (e.g. A>G), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  790
  791    def stats_to_file(self, file: str = None) -> str:
  792        """
  793        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  794        into a JSON object, and writes the JSON object to the specified file.
  795
  796        :param file: The `file` parameter is a string that represents the file path where the JSON data
  797        will be written
  798        :type file: str
  799        :return: the name of the file that was written to.
  800        """
  801
  802        # Get stats
  803        stats = self.get_stats()
  804
  805        # Serializing json
  806        json_object = json.dumps(stats, indent=4)
  807
  808        # Writing to sample.json
  809        with open(file, "w") as outfile:
  810            outfile.write(json_object)
  811
  812        return file
  813
  814    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  815        """
  816        The `print_stats` function generates a markdown file and prints the statistics contained in a
  817        JSON file in a formatted manner.
  818
  819        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  820        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  821        provided, a temporary directory will be created and the stats will be saved in a file named
  822        "stats.md" within that
  823        :type output_file: str
  824        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  825        file where the statistics will be saved. If no value is provided, a temporary directory will be
  826        created and a default file name "stats.json" will be used
  827        :type json_file: str
  828        :return: The function `print_stats` does not return any value. It has a return type annotation
  829        of `None`.
  830        """
  831
  832        # Full path
  833        output_file = full_path(output_file)
  834        json_file = full_path(json_file)
  835
  836        with tempfile.TemporaryDirectory() as tmpdir:
  837
  838            # Files
  839            if not output_file:
  840                output_file = os.path.join(tmpdir, "stats.md")
  841            if not json_file:
  842                json_file = os.path.join(tmpdir, "stats.json")
  843
  844            # Create folders
  845            if not os.path.exists(os.path.dirname(output_file)):
  846                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  847            if not os.path.exists(os.path.dirname(json_file)):
  848                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  849
  850            # Create stats JSON file
  851            stats_file = self.stats_to_file(file=json_file)
  852
  853            # Print stats file
  854            with open(stats_file) as f:
  855                stats = yaml.safe_load(f)
  856
  857            # Output
  858            output_title = []
  859            output_index = []
  860            output = []
  861
  862            # Title
  863            output_title.append("# HOWARD Stats")
  864
  865            # Index
  866            output_index.append("## Index")
  867
  868            # Process sections
  869            for section in stats:
  870                infos = stats.get(section)
  871                section_link = "#" + section.lower().replace(" ", "-")
  872                output.append(f"## {section}")
  873                output_index.append(f"- [{section}]({section_link})")
  874
  875                if len(infos):
  876                    for info in infos:
  877                        try:
  878                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  879                            is_df = True
  880                        except:
  881                            try:
  882                                df = pd.DataFrame.from_dict(
  883                                    json.loads((infos.get(info))), orient="index"
  884                                )
  885                                is_df = True
  886                            except:
  887                                is_df = False
  888                        if is_df:
  889                            output.append(f"### {info}")
  890                            info_link = "#" + info.lower().replace(" ", "-")
  891                            output_index.append(f"   - [{info}]({info_link})")
  892                            output.append(f"{df.to_markdown(index=False)}")
  893                        else:
  894                            output.append(f"- {info}: {infos.get(info)}")
  895                else:
  896                    output.append(f"NA")
  897
  898            # Write stats in markdown file
  899            with open(output_file, "w") as fp:
  900                for item in output_title:
  901                    fp.write("%s\n" % item)
  902                for item in output_index:
  903                    fp.write("%s\n" % item)
  904                for item in output:
  905                    fp.write("%s\n" % item)
  906
  907            # Output stats in markdown
  908            print("")
  909            print("\n\n".join(output_title))
  910            print("")
  911            print("\n\n".join(output))
  912            print("")
  913
  914        return None
  915
  916    def get_input(self) -> str:
  917        """
  918        It returns the value of the input variable.
  919        :return: The input is being returned.
  920        """
  921        return self.input
  922
  923    def get_input_format(self, input_file: str = None) -> str:
  924        """
  925        This function returns the format of the input variable, either from the provided input file or
  926        by prompting for input.
  927
  928        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  929        represents the file path of the input file. If no `input_file` is provided when calling the
  930        method, it will default to `None`
  931        :type input_file: str
  932        :return: The format of the input variable is being returned.
  933        """
  934
  935        if not input_file:
  936            input_file = self.get_input()
  937        input_format = get_file_format(input_file)
  938        return input_format
  939
  940    def get_input_compressed(self, input_file: str = None) -> str:
  941        """
  942        The function `get_input_compressed` returns the format of the input variable after compressing
  943        it.
  944
  945        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  946        that represents the file path of the input file. If no `input_file` is provided when calling the
  947        method, it will default to `None` and the method will then call `self.get_input()` to
  948        :type input_file: str
  949        :return: The function `get_input_compressed` returns the compressed format of the input
  950        variable.
  951        """
  952
  953        if not input_file:
  954            input_file = self.get_input()
  955        input_compressed = get_file_compressed(input_file)
  956        return input_compressed
  957
  958    def get_output(self) -> str:
  959        """
  960        It returns the output of the neuron.
  961        :return: The output of the neural network.
  962        """
  963
  964        return self.output
  965
  966    def get_output_format(self, output_file: str = None) -> str:
  967        """
  968        The function `get_output_format` returns the format of the input variable or the output file if
  969        provided.
  970
  971        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  972        that represents the file path of the output file. If no `output_file` is provided when calling
  973        the method, it will default to the output obtained from the `get_output` method of the class
  974        instance. The
  975        :type output_file: str
  976        :return: The format of the input variable is being returned.
  977        """
  978
  979        if not output_file:
  980            output_file = self.get_output()
  981        output_format = get_file_format(output_file)
  982
  983        return output_format
  984
  985    def get_config(self) -> dict:
  986        """
  987        It returns the config
  988        :return: The config variable is being returned.
  989        """
  990        return self.config
  991
  992    def get_param(self) -> dict:
  993        """
  994        It returns the param
  995        :return: The param variable is being returned.
  996        """
  997        return self.param
  998
  999    def get_connexion_db(self) -> str:
 1000        """
 1001        It returns the connexion_db attribute of the object
 1002        :return: The connexion_db is being returned.
 1003        """
 1004        return self.connexion_db
 1005
 1006    def get_prefix(self) -> str:
 1007        """
 1008        It returns the prefix of the object.
 1009        :return: The prefix is being returned.
 1010        """
 1011        return self.prefix
 1012
 1013    def get_table_variants(self, clause: str = "select") -> str:
 1014        """
 1015        This function returns the table_variants attribute of the object
 1016
 1017        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1018        defaults to select (optional)
 1019        :return: The table_variants attribute of the object.
 1020        """
 1021
 1022        # Access
 1023        access = self.get_config().get("access", None)
 1024
 1025        # Clauses "select", "where", "update"
 1026        if clause in ["select", "where", "update"]:
 1027            table_variants = self.table_variants
 1028        # Clause "from"
 1029        elif clause in ["from"]:
 1030            # For Read Only
 1031            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1032                input_file = self.get_input()
 1033                table_variants = f"'{input_file}' as variants"
 1034            # For Read Write
 1035            else:
 1036                table_variants = f"{self.table_variants} as variants"
 1037        else:
 1038            table_variants = self.table_variants
 1039        return table_variants
 1040
 1041    def get_tmp_dir(self) -> str:
 1042        """
 1043        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1044        parameters or a default path.
 1045        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1046        configuration, parameters, and a default value of "/tmp".
 1047        """
 1048
 1049        return get_tmp(
 1050            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1051        )
 1052
 1053    def get_connexion_type(self) -> str:
 1054        """
 1055        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1056
 1057        :return: The connexion type is being returned.
 1058        """
 1059        return self.get_config().get("connexion_type", "memory")
 1060
 1061    def get_connexion(self):
 1062        """
 1063        It returns the connection object
 1064
 1065        :return: The connection object.
 1066        """
 1067        return self.conn
 1068
 1069    def close_connexion(self) -> None:
 1070        """
 1071        This function closes the connection to the database.
 1072        :return: The connection is being closed.
 1073        """
 1074        return self.conn.close()
 1075
 1076    def get_header(self, type: str = "vcf"):
 1077        """
 1078        This function returns the header of the VCF file as a list of strings
 1079
 1080        :param type: the type of header you want to get, defaults to vcf (optional)
 1081        :return: The header of the vcf file.
 1082        """
 1083
 1084        if self.header_vcf:
 1085            if type == "vcf":
 1086                return self.header_vcf
 1087            elif type == "list":
 1088                return self.header_list
 1089        else:
 1090            if type == "vcf":
 1091                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1092                return header
 1093            elif type == "list":
 1094                return vcf_required
 1095
 1096    def get_header_infos_list(self) -> list:
 1097        """
 1098        This function retrieves a list of information fields from the header.
 1099        :return: A list of information fields from the header.
 1100        """
 1101
 1102        # Init
 1103        infos_list = []
 1104
 1105        for field in self.get_header().infos:
 1106            infos_list.append(field)
 1107
 1108        return infos_list
 1109
 1110    def get_header_length(self, file: str = None) -> int:
 1111        """
 1112        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1113        line.
 1114
 1115        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1116        header file. If this argument is provided, the function will read the header from the specified
 1117        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1118        :type file: str
 1119        :return: the length of the header list, excluding the #CHROM line.
 1120        """
 1121
 1122        if file:
 1123            return len(self.read_vcf_header_file(file=file)) - 1
 1124        elif self.get_header(type="list"):
 1125            return len(self.get_header(type="list")) - 1
 1126        else:
 1127            return 0
 1128
 1129    def get_header_columns(self) -> str:
 1130        """
 1131        This function returns the header list of a VCF
 1132
 1133        :return: The length of the header list.
 1134        """
 1135        if self.get_header():
 1136            return self.get_header(type="list")[-1]
 1137        else:
 1138            return ""
 1139
 1140    def get_header_columns_as_list(self) -> list:
 1141        """
 1142        This function returns the header list of a VCF
 1143
 1144        :return: The length of the header list.
 1145        """
 1146        if self.get_header():
 1147            return self.get_header_columns().strip().split("\t")
 1148        else:
 1149            return []
 1150
 1151    def get_header_columns_as_sql(self) -> str:
 1152        """
 1153        This function retruns header length (without #CHROM line)
 1154
 1155        :return: The length of the header list.
 1156        """
 1157        sql_column_list = []
 1158        for col in self.get_header_columns_as_list():
 1159            sql_column_list.append(f'"{col}"')
 1160        return ",".join(sql_column_list)
 1161
 1162    def get_header_sample_list(
 1163        self, check: bool = False, samples: list = None, samples_force: bool = False
 1164    ) -> list:
 1165        """
 1166        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1167        checking and filtering based on input parameters.
 1168
 1169        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1170        parameter that determines whether to check if the samples in the list are properly defined as
 1171        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1172        list is defined as a, defaults to False
 1173        :type check: bool (optional)
 1174        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1175        allows you to specify a subset of samples from the header. If you provide a list of sample
 1176        names, the function will check if each sample is defined in the header. If a sample is not found
 1177        in the
 1178        :type samples: list
 1179        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1180        a boolean parameter that determines whether to force the function to return the sample list
 1181        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1182        function will return the sample list without performing, defaults to False
 1183        :type samples_force: bool (optional)
 1184        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1185        parameters and conditions specified in the function.
 1186        """
 1187
 1188        # Init
 1189        samples_list = []
 1190
 1191        if samples is None:
 1192            samples_list = self.header_vcf.samples
 1193        else:
 1194            samples_checked = []
 1195            for sample in samples:
 1196                if sample in self.header_vcf.samples:
 1197                    samples_checked.append(sample)
 1198                else:
 1199                    log.warning(f"Sample '{sample}' not defined in header")
 1200            samples_list = samples_checked
 1201
 1202            # Force sample list without checking if is_genotype_column
 1203            if samples_force:
 1204                log.warning(f"Samples {samples_list} not checked if genotypes")
 1205                return samples_list
 1206
 1207        if check:
 1208            samples_checked = []
 1209            for sample in samples_list:
 1210                if self.is_genotype_column(column=sample):
 1211                    samples_checked.append(sample)
 1212                else:
 1213                    log.warning(
 1214                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1215                    )
 1216            samples_list = samples_checked
 1217
 1218        # Return samples list
 1219        return samples_list
 1220
 1221    def is_genotype_column(self, column: str = None) -> bool:
 1222        """
 1223        This function checks if a given column is a genotype column in a database.
 1224
 1225        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1226        represents the column name in a database table. This method checks if the specified column is a
 1227        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1228        method of
 1229        :type column: str
 1230        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1231        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1232        column name and returns the result. If the `column` parameter is None, it returns False.
 1233        """
 1234
 1235        if column is not None:
 1236            return Database(database=self.get_input()).is_genotype_column(column=column)
 1237        else:
 1238            return False
 1239
 1240    def get_verbose(self) -> bool:
 1241        """
 1242        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1243        exist
 1244
 1245        :return: The value of the key "verbose" in the config dictionary.
 1246        """
 1247        return self.get_config().get("verbose", False)
 1248
 1249    def get_connexion_format(self) -> str:
 1250        """
 1251        It returns the connexion format of the object.
 1252        :return: The connexion_format is being returned.
 1253        """
 1254        connexion_format = self.connexion_format
 1255        if connexion_format not in ["duckdb", "sqlite"]:
 1256            log.error(f"Unknown connexion format {connexion_format}")
 1257            raise ValueError(f"Unknown connexion format {connexion_format}")
 1258        else:
 1259            return connexion_format
 1260
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks and inserts each chunk into a table based on the specified
        database format.

        :param file: The `file` parameter is the file that you want to load into a table. It should be
        the path to the file on your system
        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
        should contain the names of the columns in the table where the data will be inserted. The column
        names should be separated by commas within the string. For example, if you have columns named
        "id", "name
        :type columns: str
        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
        the number of lines to skip at the beginning of the file before reading the actual data. This
        parameter allows you to skip any header information present in the file before processing the
        data, defaults to 0
        :type header_len: int (optional)
        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
        separator character that is used in the file being read. In this case, the default separator is
        set to `\t`, which represents a tab character. You can change this parameter to a different
        separator character if, defaults to \t
        :type sep: str (optional)
        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
        when processing the file in chunks. In the provided code snippet, the default value for
        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
        to 1000000
        :type chunksize: int (optional)
        """

        # Config
        # The chunk size may be overridden by config["load"]["chunk"]
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): if the configured chunk size resolves to a falsy value
        # (0/None), nothing is loaded at all — confirm this is intended
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # The bare "chunk" table name in the SQL below is resolved by
                    # DuckDB's replacement scan to the local pandas DataFrame, so
                    # this code depends on the loop variable being named exactly
                    # "chunk" — do not rename it
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite backend: let pandas append the DataFrame directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1314
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
        table before loading the data and specify a sample size.

        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
        20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # change input file (and re-read its header) if explicitly given
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table before (re)loading, if requested
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access mode from config (e.g. "RO" for read-only)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format (only logged; loading below keys off input_compressed)
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format ("duckdb" or "sqlite")
        connexion_format = self.get_connexion_format()

        # Sample size: falsy values mean "no sampling" (-1)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                # NOTE(review): this inner check is always true here (outer branch
                # already guarantees connexion_format is "duckdb")
                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only access exposes the source as a view; otherwise
                    # materialize it as a table
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit
                # and hides the original error — consider `except Exception as e` with
                # `raise ... from e`
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure: mandatory VCF columns and their SQLite types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): structure_complete is an alias of structure (no copy),
            # so FORMAT added to `structure` below also lands in structure_complete;
            # the code relies on this aliasing — do not replace with a copy
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter depends on the input format (tab for VCF/TSV, etc.)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): rebinding input_file here leaks the plain-text handle
                # opened by the `with` above, and the bgzf handle is never explicitly
                # closed — confirm this is acceptable
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()
 1510
 1511    def get_explode_infos(self) -> bool:
 1512        """
 1513        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1514        to False if it is not set.
 1515        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1516        value. If the parameter is not present, it will return False.
 1517        """
 1518
 1519        return self.get_param().get("explode", {}).get("explode_infos", False)
 1520
 1521    def get_explode_infos_fields(
 1522        self,
 1523        explode_infos_fields: str = None,
 1524        remove_fields_not_in_header: bool = False,
 1525    ) -> list:
 1526        """
 1527        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1528        the input parameter `explode_infos_fields`.
 1529
 1530        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1531        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1532        comma-separated list of field names to explode
 1533        :type explode_infos_fields: str
 1534        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1535        flag that determines whether to remove fields that are not present in the header. If it is set
 1536        to `True`, any field that is not in the header will be excluded from the list of exploded
 1537        information fields. If it is set to `, defaults to False
 1538        :type remove_fields_not_in_header: bool (optional)
 1539        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1540        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1541        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1542        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1543        splitting the string by commas.
 1544        """
 1545
 1546        # If no fields, get it in param
 1547        if not explode_infos_fields:
 1548            explode_infos_fields = (
 1549                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1550            )
 1551
 1552        # If no fields, defined as all fields in header using keyword
 1553        if not explode_infos_fields:
 1554            explode_infos_fields = "*"
 1555
 1556        # If fields list not empty
 1557        if explode_infos_fields:
 1558
 1559            # Input fields list
 1560            if isinstance(explode_infos_fields, str):
 1561                fields_input = explode_infos_fields.split(",")
 1562            elif isinstance(explode_infos_fields, list):
 1563                fields_input = explode_infos_fields
 1564            else:
 1565                fields_input = []
 1566
 1567            # Fields list without * keyword
 1568            fields_without_all = fields_input.copy()
 1569            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1570                fields_without_all.remove("*")
 1571
 1572            # Fields in header
 1573            fields_in_header = sorted(list(set(self.get_header().infos)))
 1574
 1575            # Construct list of fields
 1576            fields_output = []
 1577            for field in fields_input:
 1578
 1579                # Strip field
 1580                field = field.strip()
 1581
 1582                # format keyword * in regex
 1583                if field.upper() in ["*"]:
 1584                    field = ".*"
 1585
 1586                # Find all fields with pattern
 1587                r = re.compile(field)
 1588                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1589
 1590                # Remove fields input from search
 1591                if field in fields_search:
 1592                    fields_search = [field]
 1593                elif fields_search != [field]:
 1594                    fields_search = sorted(
 1595                        list(set(fields_search).difference(fields_input))
 1596                    )
 1597
 1598                # If field is not in header (avoid not well formatted header)
 1599                if not fields_search and not remove_fields_not_in_header:
 1600                    fields_search = [field]
 1601
 1602                # Add found fields
 1603                for new_field in fields_search:
 1604                    # Add field, if not already exists, and if it is in header (if asked)
 1605                    if (
 1606                        new_field not in fields_output
 1607                        and (
 1608                            not remove_fields_not_in_header
 1609                            or new_field in fields_in_header
 1610                        )
 1611                        and new_field not in [".*"]
 1612                    ):
 1613                        fields_output.append(new_field)
 1614
 1615            return fields_output
 1616
 1617        else:
 1618
 1619            return []
 1620
 1621    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1622        """
 1623        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1624        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1625        not provided.
 1626
 1627        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1628        prefix to be used for exploding or expanding information
 1629        :type explode_infos_prefix: str
 1630        :return: the value of the variable `explode_infos_prefix`.
 1631        """
 1632
 1633        if not explode_infos_prefix:
 1634            explode_infos_prefix = (
 1635                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1636            )
 1637
 1638        return explode_infos_prefix
 1639
 1640    def add_column(
 1641        self,
 1642        table_name,
 1643        column_name,
 1644        column_type,
 1645        default_value=None,
 1646        drop: bool = False,
 1647    ) -> dict:
 1648        """
 1649        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1650        doesn't already exist.
 1651
 1652        :param table_name: The name of the table to which you want to add a column
 1653        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1654        to the table
 1655        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1656        want to add to the table. It should be a string that represents the desired data type, such as
 1657        "INTEGER", "TEXT", "REAL", etc
 1658        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1659        default value for the newly added column. If a default value is provided, it will be assigned to
 1660        the column for any existing rows that do not have a value for that column
 1661        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1662        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1663        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1664        to False
 1665        :type drop: bool (optional)
 1666        :return: a boolean value indicating whether the column was successfully added to the table.
 1667        """
 1668
 1669        # added
 1670        added = False
 1671        dropped = False
 1672
 1673        # Check if the column already exists in the table
 1674        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1675        columns = self.get_query_to_df(query).columns.tolist()
 1676        if column_name.upper() in [c.upper() for c in columns]:
 1677            log.debug(
 1678                f"The {column_name} column already exists in the {table_name} table"
 1679            )
 1680            if drop:
 1681                self.drop_column(table_name=table_name, column_name=column_name)
 1682                dropped = True
 1683            else:
 1684                return None
 1685        else:
 1686            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1687
 1688        # Add column in table
 1689        add_column_query = (
 1690            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1691        )
 1692        if default_value is not None:
 1693            add_column_query += f" DEFAULT {default_value}"
 1694        self.execute_query(add_column_query)
 1695        added = not dropped
 1696        log.debug(
 1697            f"The {column_name} column was successfully added to the {table_name} table"
 1698        )
 1699
 1700        if added:
 1701            added_column = {
 1702                "table_name": table_name,
 1703                "column_name": column_name,
 1704                "column_type": column_type,
 1705                "default_value": default_value,
 1706            }
 1707        else:
 1708            added_column = None
 1709
 1710        return added_column
 1711
 1712    def drop_column(
 1713        self, column: dict = None, table_name: str = None, column_name: str = None
 1714    ) -> bool:
 1715        """
 1716        The `drop_column` function drops a specified column from a given table in a database and returns
 1717        True if the column was successfully dropped, and False if the column does not exist in the
 1718        table.
 1719
 1720        :param column: The `column` parameter is a dictionary that contains information about the column
 1721        you want to drop. It has two keys:
 1722        :type column: dict
 1723        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1724        drop a column
 1725        :type table_name: str
 1726        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1727        from the table
 1728        :type column_name: str
 1729        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1730        and False if the column does not exist in the table.
 1731        """
 1732
 1733        # Find column infos
 1734        if column:
 1735            if isinstance(column, dict):
 1736                table_name = column.get("table_name", None)
 1737                column_name = column.get("column_name", None)
 1738            elif isinstance(column, str):
 1739                table_name = self.get_table_variants()
 1740                column_name = column
 1741            else:
 1742                table_name = None
 1743                column_name = None
 1744
 1745        if not table_name and not column_name:
 1746            return False
 1747
 1748        # Removed
 1749        removed = False
 1750
 1751        # Check if the column already exists in the table
 1752        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1753        columns = self.get_query_to_df(query).columns.tolist()
 1754        if column_name in columns:
 1755            log.debug(f"The {column_name} column exists in the {table_name} table")
 1756        else:
 1757            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1758            return False
 1759
 1760        # Add column in table # ALTER TABLE integers DROP k
 1761        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1762        self.execute_query(add_column_query)
 1763        removed = True
 1764        log.debug(
 1765            f"The {column_name} column was successfully dropped to the {table_name} table"
 1766        )
 1767
 1768        return removed
 1769
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix, falling back to "INFO/"
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. If this parameter is not provided, the
        configured fields (or all INFO fields) will be exploded. Entries may be regex patterns resolved
        against the header by `get_explode_infos_fields`
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table. If `force`
        is set to `True`, the column will be dropped and recreated. If `force` is set to `False`,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields in a single UPDATE statement or one
        UPDATE per field. If set to `True`, all the INFO fields will be processed together. If set to
        `False`, each INFO field will be processed individually. Defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If not
        provided, the variants table is used
        :type table: str
        :return: The `explode_infos` function returns a list of added columns (dicts as produced by
        `add_column`). The list is empty in read-only ("RO") access mode.
        """

        # Drop indexes first: the bulk UPDATEs below would otherwise have to maintain them
        self.drop_indexes()

        # connexion format ("duckdb" or "sqlite") selects the SQL extraction template below
        connexion_format = self.get_connexion_format()

        # Access mode: nothing is altered in read-only ("RO") mode
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: default to "INFO/" unless a usable prefix is configured
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants (explicit table overrides the default variants table)
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: fields known outside the header; empty when unavailable)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check: header fields plus any explicitly requested fields
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (empty list falls back to configured/all fields)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only explode fields known from the header, the request, or extra infos
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # SQL type derived from the VCF header; unknown fields default to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields (Number != 1) are always stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # add_column returns None for dropped-and-recreated columns,
                    # so `force` ensures they are still (re)populated below
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the per-field UPDATE expression extracting the value
                        # from the raw INFO string.
                        # NOTE(review): if connexion_format is neither "duckdb" nor
                        # "sqlite", update_info_field is unbound (or stale from the
                        # previous iteration) — confirm formats are validated upstream.
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to keep each UPDATE bounded;
                # fall back to a single pass when chromosomes cannot be listed
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed when there is more than one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: either one UPDATE with all SET expressions,
                    # or one UPDATE per exploded field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
 1986
 1987    def create_indexes(self) -> None:
 1988        """
 1989        Create indexes on the table after insertion
 1990        """
 1991
 1992        # Access
 1993        access = self.get_config().get("access", None)
 1994
 1995        # get table variants
 1996        table_variants = self.get_table_variants("FROM")
 1997
 1998        if self.get_indexing() and access not in ["RO"]:
 1999            # Create index
 2000            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2001            self.conn.execute(sql_create_table_index)
 2002            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2003            self.conn.execute(sql_create_table_index)
 2004            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2005            self.conn.execute(sql_create_table_index)
 2006            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2007            self.conn.execute(sql_create_table_index)
 2008            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2009            self.conn.execute(sql_create_table_index)
 2010            for field in self.index_additionnal_fields:
 2011                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2012                self.conn.execute(sql_create_table_index)
 2013
 2014    def drop_indexes(self) -> None:
 2015        """
 2016        Create indexes on the table after insertion
 2017        """
 2018
 2019        # Access
 2020        access = self.get_config().get("access", None)
 2021
 2022        # get table variants
 2023        table_variants = self.get_table_variants("FROM")
 2024
 2025        # Get database format
 2026        connexion_format = self.get_connexion_format()
 2027
 2028        if access not in ["RO"]:
 2029            if connexion_format in ["duckdb"]:
 2030                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2031            elif connexion_format in ["sqlite"]:
 2032                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2033
 2034            list_indexes = self.conn.execute(sql_list_indexes)
 2035            index_names = [row[0] for row in list_indexes.fetchall()]
 2036            for index in index_names:
 2037                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2038                self.conn.execute(sql_drop_table_index)
 2039
 2040    def read_vcf_header(self, f) -> list:
 2041        """
 2042        It reads the header of a VCF file and returns a list of the header lines
 2043
 2044        :param f: the file object
 2045        :return: The header lines of the VCF file.
 2046        """
 2047
 2048        header_list = []
 2049        for line in f:
 2050            header_list.append(line)
 2051            if line.startswith("#CHROM"):
 2052                break
 2053        return header_list
 2054
 2055    def read_vcf_header_file(self, file: str = None) -> list:
 2056        """
 2057        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2058        uncompressed files.
 2059
 2060        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2061        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2062        default to `None`
 2063        :type file: str
 2064        :return: The function `read_vcf_header_file` returns a list.
 2065        """
 2066
 2067        if self.get_input_compressed(input_file=file):
 2068            with bgzf.open(file, "rt") as f:
 2069                return self.read_vcf_header(f=f)
 2070        else:
 2071            with open(file, "rt") as f:
 2072                return self.read_vcf_header(f=f)
 2073
 2074    def execute_query(self, query: str):
 2075        """
 2076        It takes a query as an argument, executes it, and returns the results
 2077
 2078        :param query: The query to be executed
 2079        :return: The result of the query is being returned.
 2080        """
 2081        if query:
 2082            return self.conn.execute(query)  # .fetchall()
 2083        else:
 2084            return None
 2085
 2086    def export_output(
 2087        self,
 2088        output_file: str | None = None,
 2089        output_header: str | None = None,
 2090        export_header: bool = True,
 2091        query: str | None = None,
 2092        parquet_partitions: list | None = None,
 2093        chunk_size: int | None = None,
 2094        threads: int | None = None,
 2095        sort: bool = False,
 2096        index: bool = False,
 2097        order_by: str | None = None,
 2098        fields_to_rename: dict | None = None
 2099    ) -> bool:
 2100        """
 2101        The `export_output` function exports data from a VCF file to various formats, including VCF,
 2102        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
 2103        partitioning.
 2104        
 2105        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2106        output file where the exported data will be saved
 2107        :type output_file: str | None
 2108        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2109        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2110        header will be exported to a file with the same name as the `output_file` parameter, but with
 2111        the extension "
 2112        :type output_header: str | None
 2113        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2114        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2115        True, the header will be exported to a file. If `export_header` is False, the header will not
 2116        be, defaults to True
 2117        :type export_header: bool (optional)
 2118        :param query: The `query` parameter in the `export_output` function is an optional SQL query
 2119        that can be used to filter and select specific data from the VCF file before exporting it. If
 2120        provided, only the data that matches the query will be exported. This allows you to customize
 2121        the exported data based on
 2122        :type query: str | None
 2123        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2124        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2125        organize data in a hierarchical directory structure based on the values of one or more columns.
 2126        This can improve query performance when working with large datasets
 2127        :type parquet_partitions: list | None
 2128        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
 2129        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
 2130        multiple files. It helps in optimizing the export process by breaking down the data into
 2131        manageable chunks for processing and storage
 2132        :type chunk_size: int | None
 2133        :param threads: The `threads` parameter in the `export_output` function specifies the number of
 2134        threads to be used during the export process. It determines the level of parallelism and can
 2135        improve the performance of the export operation. If this parameter is not provided, the function
 2136        will use the default number of threads
 2137        :type threads: int | None
 2138        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
 2139        determines whether the output file should be sorted based on genomic coordinates of the
 2140        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
 2141        `False`,, defaults to False
 2142        :type sort: bool (optional)
 2143        :param index: The `index` parameter in the `export_output` function is a boolean flag that
 2144        determines whether an index should be created on the output file. If `index` is set to `True`,
 2145        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
 2146        :type index: bool (optional)
 2147        :param order_by: The `order_by` parameter in the `export_output` function is a string that
 2148        specifies the column(s) to use for sorting the output file. This parameter is only applicable
 2149        when exporting data in VCF format. It allows you to specify the column(s) based on which the
 2150        output file should be
 2151        :type order_by: str | None
 2152        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
 2153        mapping of field names to be renamed during the export process. This parameter allows you to
 2154        customize the output field names before exporting the data. Each key-value pair in the
 2155        dictionary represents the original field name as the key and the new field name
 2156        :type fields_to_rename: dict | None
 2157        :return: The `export_output` function returns a boolean value. It checks if the output file
 2158        exists and returns True if it does, or None if it doesn't.
 2159        """
 2160
 2161        # Log
 2162        log.info("Exporting...")
 2163
 2164        # Full path
 2165        output_file = full_path(output_file)
 2166        output_header = full_path(output_header)
 2167
 2168        # Config
 2169        config = self.get_config()
 2170
 2171        # Param
 2172        param = self.get_param()
 2173
 2174        # Tmp files to remove
 2175        tmp_to_remove = []
 2176
 2177        # If no output, get it
 2178        if not output_file:
 2179            output_file = self.get_output()
 2180
 2181        # If not threads
 2182        if not threads:
 2183            threads = self.get_threads()
 2184
 2185        # Rename fields
 2186        if not fields_to_rename:
 2187            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
 2188        self.rename_info_fields(fields_to_rename=fields_to_rename)
 2189
 2190        # Auto header name with extension
 2191        if export_header or output_header:
 2192            if not output_header:
 2193                output_header = f"{output_file}.hdr"
 2194            # Export header
 2195            self.export_header(output_file=output_file)
 2196
 2197        # Switch off export header if VCF output
 2198        output_file_type = get_file_format(output_file)
 2199        if output_file_type in ["vcf"]:
 2200            export_header = False
 2201            tmp_to_remove.append(output_header)
 2202
 2203        # Chunk size
 2204        if not chunk_size:
 2205            chunk_size = config.get("chunk_size", None)
 2206
 2207        # Parquet partition
 2208        if not parquet_partitions:
 2209            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2210        if parquet_partitions and isinstance(parquet_partitions, str):
 2211            parquet_partitions = parquet_partitions.split(",")
 2212
 2213        # Order by
 2214        if not order_by:
 2215            order_by = param.get("export", {}).get("order_by", "")
 2216
 2217        # Header in output
 2218        header_in_output = param.get("export", {}).get("include_header", False)
 2219
 2220        # Database
 2221        database_source = self.get_connexion()
 2222
 2223        # Connexion format
 2224        connexion_format = self.get_connexion_format()
 2225
 2226        # Explode infos
 2227        if self.get_explode_infos():
 2228            self.explode_infos(
 2229                prefix=self.get_explode_infos_prefix(),
 2230                fields=self.get_explode_infos_fields(),
 2231                force=False,
 2232            )
 2233
 2234        # if connexion_format in ["sqlite"] or query:
 2235        if connexion_format in ["sqlite"]:
 2236
 2237            # Export in Parquet
 2238            random_tmp = "".join(
 2239                random.choice(string.ascii_lowercase) for i in range(10)
 2240            )
 2241            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2242            tmp_to_remove.append(database_source)
 2243
 2244            # Table Variants
 2245            table_variants = self.get_table_variants()
 2246
 2247            # Create export query
 2248            sql_query_export_subquery = f"""
 2249                SELECT * FROM {table_variants}
 2250                """
 2251
 2252            # Write source file
 2253            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2254
 2255        # Create database
 2256        database = Database(
 2257            database=database_source,
 2258            table="variants",
 2259            header_file=output_header,
 2260            conn_config=self.get_connexion_config(),
 2261        )
 2262
 2263        # Existing colomns header
 2264        existing_columns_header = database.get_header_columns_from_database(query=query)
 2265
 2266        # Sample list
 2267        if output_file_type in ["vcf"]:
 2268            get_samples = self.get_samples()
 2269            get_samples_check = self.get_samples_check()
 2270            samples_force = get_samples is not None
 2271            sample_list = self.get_header_sample_list(
 2272                check=get_samples_check,
 2273                samples=get_samples,
 2274                samples_force=samples_force,
 2275            )
 2276        else:
 2277            sample_list = None
 2278
 2279        # Export file
 2280        database.export(
 2281            output_database=output_file,
 2282            output_header=output_header,
 2283            existing_columns_header=existing_columns_header,
 2284            parquet_partitions=parquet_partitions,
 2285            chunk_size=chunk_size,
 2286            threads=threads,
 2287            sort=sort,
 2288            index=index,
 2289            header_in_output=header_in_output,
 2290            order_by=order_by,
 2291            query=query,
 2292            export_header=export_header,
 2293            sample_list=sample_list,
 2294        )
 2295
 2296        # Remove
 2297        remove_if_exists(tmp_to_remove)
 2298
 2299        return (os.path.exists(output_file) or None) and (
 2300            os.path.exists(output_file) or None
 2301        )
 2302
 2303    def get_extra_infos(self, table: str = None) -> list:
 2304        """
 2305        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2306        in the header.
 2307
 2308        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2309        name of the table from which you want to retrieve the extra columns that are not present in the
 2310        header. If the `table` parameter is not provided when calling the function, it will default to
 2311        using the variants
 2312        :type table: str
 2313        :return: A list of columns that are in the specified table but not in the header of the table.
 2314        """
 2315
 2316        header_columns = []
 2317
 2318        if not table:
 2319            table = self.get_table_variants(clause="from")
 2320            header_columns = self.get_header_columns()
 2321
 2322        # Check all columns in the database
 2323        query = f""" SELECT * FROM {table} LIMIT 1 """
 2324        log.debug(f"query {query}")
 2325        table_columns = self.get_query_to_df(query).columns.tolist()
 2326        extra_columns = []
 2327
 2328        # Construct extra infos (not in header)
 2329        for column in table_columns:
 2330            if column not in header_columns:
 2331                extra_columns.append(column)
 2332
 2333        return extra_columns
 2334
 2335    def get_extra_infos_sql(self, table: str = None) -> str:
 2336        """
 2337        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2338        by double quotes
 2339
 2340        :param table: The name of the table to get the extra infos from. If None, the default table is
 2341        used
 2342        :type table: str
 2343        :return: A string of the extra infos
 2344        """
 2345
 2346        return ", ".join(
 2347            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2348        )
 2349
 2350    def export_header(
 2351        self,
 2352        header_name: str = None,
 2353        output_file: str = None,
 2354        output_file_ext: str = ".hdr",
 2355        clean_header: bool = True,
 2356        remove_chrom_line: bool = False,
 2357    ) -> str:
 2358        """
 2359        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2360        specified options, and writes it to a new file.
 2361
 2362        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2363        this parameter is not specified, the header will be written to the output file
 2364        :type header_name: str
 2365        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2366        specify the name of the output file where the header will be written. If this parameter is not
 2367        provided, the header will be written to a temporary file
 2368        :type output_file: str
 2369        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2370        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2371        if not specified by the user. This extension will be appended to the `output_file` name to
 2372        create the final, defaults to .hdr
 2373        :type output_file_ext: str (optional)
 2374        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2375        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2376        `True`, the function will clean the header by modifying certain lines based on a specific
 2377        pattern. If `clean_header`, defaults to True
 2378        :type clean_header: bool (optional)
 2379        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2380        boolean flag that determines whether the #CHROM line should be removed from the header before
 2381        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2382        defaults to False
 2383        :type remove_chrom_line: bool (optional)
 2384        :return: The function `export_header` returns the name of the temporary header file that is
 2385        created.
 2386        """
 2387
 2388        if not header_name and not output_file:
 2389            output_file = self.get_output()
 2390
 2391        if self.get_header():
 2392
 2393            # Get header object
 2394            header_obj = self.get_header()
 2395
 2396            # Create database
 2397            db_for_header = Database(database=self.get_input())
 2398
 2399            # Get real columns in the file
 2400            db_header_columns = db_for_header.get_columns()
 2401
 2402            with tempfile.TemporaryDirectory() as tmpdir:
 2403
 2404                # Write header file
 2405                header_file_tmp = os.path.join(tmpdir, "header")
 2406                f = open(header_file_tmp, "w")
 2407                vcf.Writer(f, header_obj)
 2408                f.close()
 2409
 2410                # Replace #CHROM line with rel columns
 2411                header_list = db_for_header.read_header_file(
 2412                    header_file=header_file_tmp
 2413                )
 2414                header_list[-1] = "\t".join(db_header_columns)
 2415
 2416                # Remove CHROM line
 2417                if remove_chrom_line:
 2418                    header_list.pop()
 2419
 2420                # Clean header
 2421                if clean_header:
 2422                    header_list_clean = []
 2423                    for head in header_list:
 2424                        # Clean head for malformed header
 2425                        head_clean = head
 2426                        head_clean = re.subn(
 2427                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2428                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2429                            head_clean,
 2430                            2,
 2431                        )[0]
 2432                        # Write header
 2433                        header_list_clean.append(head_clean)
 2434                    header_list = header_list_clean
 2435
 2436            tmp_header_name = output_file + output_file_ext
 2437
 2438            f = open(tmp_header_name, "w")
 2439            for line in header_list:
 2440                f.write(line)
 2441            f.close()
 2442
 2443        return tmp_header_name
 2444
 2445    def export_variant_vcf(
 2446        self,
 2447        vcf_file,
 2448        remove_info: bool = False,
 2449        add_samples: bool = True,
 2450        list_samples: list = [],
 2451        where_clause: str = "",
 2452        index: bool = False,
 2453        threads: int | None = None,
 2454    ) -> bool | None:
 2455        """
 2456        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2457        remove INFO field, add samples, and control compression and indexing.
 2458
 2459        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2460        written to. It is the output file that will contain the filtered VCF data based on the specified
 2461        parameters
 2462        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2463        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2464        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2465        in, defaults to False
 2466        :type remove_info: bool (optional)
 2467        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2468        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2469        If set to False, the samples will be removed. The default value is True, defaults to True
 2470        :type add_samples: bool (optional)
 2471        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2472        in the output VCF file. By default, all samples will be included. If you provide a list of
 2473        samples, only those samples will be included in the output file
 2474        :type list_samples: list
 2475        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2476        determines whether or not to create an index for the output VCF file. If `index` is set to
 2477        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2478        :type index: bool (optional)
 2479        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2480        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2481        will be used during the export process. More threads can potentially speed up the export process
 2482        by utilizing multiple cores of the processor. If
 2483        :type threads: int | None
 2484        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2485        method with various parameters including the output file, query, threads, sort flag, and index
 2486        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2487        specified parameters and configurations provided in the `export_variant_vcf` function.
 2488        """
 2489
 2490        # Config
 2491        config = self.get_config()
 2492
 2493        # Extract VCF
 2494        log.debug("Export VCF...")
 2495
 2496        # Table variants
 2497        table_variants = self.get_table_variants()
 2498
 2499        # Threads
 2500        if not threads:
 2501            threads = self.get_threads()
 2502
 2503        # Info fields
 2504        if remove_info:
 2505            if not isinstance(remove_info, str):
 2506                remove_info = "."
 2507            info_field = f"""'{remove_info}' as INFO"""
 2508        else:
 2509            info_field = "INFO"
 2510
 2511        # Samples fields
 2512        if add_samples:
 2513            if not list_samples:
 2514                list_samples = self.get_header_sample_list()
 2515            if list_samples:
 2516                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2517            else:
 2518                samples_fields = ""
 2519            log.debug(f"samples_fields: {samples_fields}")
 2520        else:
 2521            samples_fields = ""
 2522
 2523        # Where clause
 2524        if where_clause is None:
 2525            where_clause = ""
 2526
 2527        # Variants
 2528        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2529        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2530        log.debug(f"sql_query_select={sql_query_select}")
 2531
 2532        return self.export_output(
 2533            output_file=vcf_file,
 2534            output_header=None,
 2535            export_header=True,
 2536            query=sql_query_select,
 2537            parquet_partitions=None,
 2538            chunk_size=config.get("chunk_size", None),
 2539            threads=threads,
 2540            sort=True,
 2541            index=index,
 2542            order_by=None,
 2543        )
 2544
 2545    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2546        """
 2547        It takes a list of commands and runs them in parallel using the number of threads specified
 2548
 2549        :param commands: A list of commands to run
 2550        :param threads: The number of threads to use, defaults to 1 (optional)
 2551        """
 2552
 2553        run_parallel_commands(commands, threads)
 2554
 2555    def get_threads(self, default: int = 1) -> int:
 2556        """
 2557        This function returns the number of threads to use for a job, with a default value of 1 if not
 2558        specified.
 2559
 2560        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2561        default number of threads to use if no specific value is provided. If no value is provided for
 2562        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2563        used, defaults to 1
 2564        :type default: int (optional)
 2565        :return: the number of threads to use for the current job.
 2566        """
 2567
 2568        # Config
 2569        config = self.get_config()
 2570
 2571        # Param
 2572        param = self.get_param()
 2573
 2574        # Input threads
 2575        input_thread = param.get("threads", config.get("threads", None))
 2576
 2577        # Check threads
 2578        if not input_thread:
 2579            threads = default
 2580        elif int(input_thread) <= 0:
 2581            threads = os.cpu_count()
 2582        else:
 2583            threads = int(input_thread)
 2584        return threads
 2585
 2586    def get_memory(self, default: str = None) -> str:
 2587        """
 2588        This function retrieves the memory value from parameters or configuration with a default value
 2589        if not found.
 2590
 2591        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2592        default value is used as a fallback in case the `memory` parameter is not provided in the
 2593        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2594        the function
 2595        :type default: str
 2596        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2597        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2598        return the default value provided as an argument to the function.
 2599        """
 2600
 2601        # Config
 2602        config = self.get_config()
 2603
 2604        # Param
 2605        param = self.get_param()
 2606
 2607        # Input threads
 2608        input_memory = param.get("memory", config.get("memory", None))
 2609
 2610        # Check threads
 2611        if input_memory:
 2612            memory = input_memory
 2613        else:
 2614            memory = default
 2615
 2616        return memory
 2617
 2618    def update_from_vcf(self, vcf_file: str) -> None:
 2619        """
 2620        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2621
 2622        :param vcf_file: the path to the VCF file
 2623        """
 2624
 2625        connexion_format = self.get_connexion_format()
 2626
 2627        if connexion_format in ["duckdb"]:
 2628            self.update_from_vcf_duckdb(vcf_file)
 2629        elif connexion_format in ["sqlite"]:
 2630            self.update_from_vcf_sqlite(vcf_file)
 2631
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (duckdb).

        The VCF body is loaded into a pandas DataFrame, joined against the
        variants table on #CHROM/POS/REF/ALT, and matching non-empty INFO
        values are appended to the existing INFO field, separated by ';'.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: the SQL below references the local DataFrame "vcf_df" by name;
        # duckdb resolves it through its replacement scan of Python locals, so
        # do not rename the variable even though it looks unused from Python.
        # concat() treats NULL as empty in duckdb, so variants with no matching
        # VCF record keep their INFO unchanged.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2687
 2688    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2689        """
 2690        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2691        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2692        table
 2693
 2694        :param vcf_file: The path to the VCF file you want to update the database with
 2695        """
 2696
 2697        # Create a temporary table for the VCF
 2698        table_vcf = "tmp_vcf"
 2699        sql_create = (
 2700            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2701        )
 2702        self.conn.execute(sql_create)
 2703
 2704        # Loading VCF into temporaire table
 2705        vcf_df = pd.read_csv(
 2706            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2707        )
 2708        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2709        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2710
 2711        # Update table 'variants' with VCF data
 2712        # warning: CONCAT as || operator
 2713        sql_query_update = f"""
 2714            UPDATE variants as table_variants
 2715            SET INFO = CASE
 2716                            WHEN INFO NOT IN ('', '.')
 2717                            THEN INFO
 2718                            ELSE ''
 2719                        END ||
 2720                        (
 2721                        SELECT 
 2722                            CASE 
 2723                                WHEN table_variants.INFO NOT IN ('','.') 
 2724                                    AND table_vcf.INFO NOT IN ('','.')  
 2725                                THEN ';' 
 2726                                ELSE '' 
 2727                            END || 
 2728                            CASE 
 2729                                WHEN table_vcf.INFO NOT IN ('','.') 
 2730                                THEN table_vcf.INFO 
 2731                                ELSE '' 
 2732                            END
 2733                        FROM {table_vcf} as table_vcf
 2734                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2735                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2736                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2737                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2738                        )
 2739        """
 2740        self.conn.execute(sql_query_update)
 2741
 2742        # Drop temporary table
 2743        sql_drop = f"DROP TABLE {table_vcf}"
 2744        self.conn.execute(sql_drop)
 2745
 2746    def drop_variants_table(self) -> None:
 2747        """
 2748        > This function drops the variants table
 2749        """
 2750
 2751        table_variants = self.get_table_variants()
 2752        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2753        self.conn.execute(sql_table_variants)
 2754
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant identifier column to the variants table and populate it
        with a hash of the assembly, `#CHROM`, `POS`, `REF` and `ALT` columns.

        :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the column is (re)populated even if it already
        exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE so SV type can contribute to the identifier
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column name fallback
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column if missing (or when forced)
        # NOTE(review): the guard checks the literal "variant_id" rather than
        # variant_id_column — with a custom column name and no force, the
        # column may not be created; confirm intended.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash argument '"{prefix}SVTYPE"' is a
            # quoted string literal, not a column reference — every row hashes
            # the same constant for SVTYPE; verify whether "{prefix}SVTYPE"
            # (an identifier) was intended.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the columns added by explode_infos above
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
 2813
 2814    def get_variant_id_column(
 2815        self, variant_id_column: str = "variant_id", force: bool = None
 2816    ) -> str:
 2817        """
 2818        This function returns the variant_id column name
 2819
 2820        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2821        defaults to variant_id
 2822        :type variant_id_column: str (optional)
 2823        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2824        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2825        if it is not already set, or if it is set
 2826        :type force: bool
 2827        :return: The variant_id column name.
 2828        """
 2829
 2830        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2831
 2832    ###
 2833    # Annotation
 2834    ###
 2835
 2836    def scan_databases(
 2837        self,
 2838        database_formats: list = ["parquet"],
 2839        database_releases: list = ["current"],
 2840    ) -> dict:
 2841        """
 2842        The function `scan_databases` scans for available databases based on specified formats and
 2843        releases.
 2844
 2845        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2846        of the databases to be scanned. In this case, the accepted format is "parquet"
 2847        :type database_formats: list ["parquet"]
 2848        :param database_releases: The `database_releases` parameter is a list that specifies the
 2849        releases of the databases to be scanned. In the provided function, the default value for
 2850        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2851        databases that are in the "current"
 2852        :type database_releases: list
 2853        :return: The function `scan_databases` returns a dictionary containing information about
 2854        databases that match the specified formats and releases.
 2855        """
 2856
 2857        # Config
 2858        config = self.get_config()
 2859
 2860        # Param
 2861        param = self.get_param()
 2862
 2863        # Param - Assembly
 2864        assembly = param.get("assembly", config.get("assembly", None))
 2865        if not assembly:
 2866            assembly = DEFAULT_ASSEMBLY
 2867            log.warning(f"Default assembly '{assembly}'")
 2868
 2869        # Scan for availabled databases
 2870        log.info(
 2871            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2872        )
 2873        databases_infos_dict = databases_infos(
 2874            database_folder_releases=database_releases,
 2875            database_formats=database_formats,
 2876            assembly=assembly,
 2877            config=config,
 2878        )
 2879        log.info(
 2880            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2881        )
 2882
 2883        return databases_infos_dict
 2884
 2885    def annotation(self) -> None:
 2886        """
 2887        It annotates the VCF file with the annotations specified in the config file.
 2888        """
 2889
 2890        # Config
 2891        config = self.get_config()
 2892
 2893        # Param
 2894        param = self.get_param()
 2895
 2896        # Param - Assembly
 2897        assembly = param.get("assembly", config.get("assembly", None))
 2898        if not assembly:
 2899            assembly = DEFAULT_ASSEMBLY
 2900            log.warning(f"Default assembly '{assembly}'")
 2901
 2902        # annotations databases folders
 2903        annotations_databases = set(
 2904            config.get("folders", {})
 2905            .get("databases", {})
 2906            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2907            + config.get("folders", {})
 2908            .get("databases", {})
 2909            .get("parquet", ["~/howard/databases/parquet/current"])
 2910            + config.get("folders", {})
 2911            .get("databases", {})
 2912            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2913        )
 2914
 2915        # Get param annotations
 2916        if param.get("annotations", None) and isinstance(
 2917            param.get("annotations", None), str
 2918        ):
 2919            log.debug(param.get("annotations", None))
 2920            param_annotation_list = param.get("annotations").split(",")
 2921        else:
 2922            param_annotation_list = []
 2923
 2924        # Each tools param
 2925        if param.get("annotation_parquet", None) != None:
 2926            log.debug(
 2927                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2928            )
 2929            if isinstance(param.get("annotation_parquet", None), list):
 2930                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2931            else:
 2932                param_annotation_list.append(param.get("annotation_parquet"))
 2933        if param.get("annotation_snpsift", None) != None:
 2934            if isinstance(param.get("annotation_snpsift", None), list):
 2935                param_annotation_list.append(
 2936                    "snpsift:"
 2937                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2938                )
 2939            else:
 2940                param_annotation_list.append(
 2941                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2942                )
 2943        if param.get("annotation_snpeff", None) != None:
 2944            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2945        if param.get("annotation_bcftools", None) != None:
 2946            if isinstance(param.get("annotation_bcftools", None), list):
 2947                param_annotation_list.append(
 2948                    "bcftools:"
 2949                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2950                )
 2951            else:
 2952                param_annotation_list.append(
 2953                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2954                )
 2955        if param.get("annotation_annovar", None) != None:
 2956            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2957        if param.get("annotation_exomiser", None) != None:
 2958            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2959        if param.get("annotation_splice", None) != None:
 2960            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2961
 2962        # Merge param annotations list
 2963        param["annotations"] = ",".join(param_annotation_list)
 2964
 2965        # debug
 2966        log.debug(f"param_annotations={param['annotations']}")
 2967
 2968        if param.get("annotations"):
 2969
 2970            # Log
 2971            # log.info("Annotations - Check annotation parameters")
 2972
 2973            if not "annotation" in param:
 2974                param["annotation"] = {}
 2975
 2976            # List of annotations parameters
 2977            annotations_list_input = {}
 2978            if isinstance(param.get("annotations", None), str):
 2979                annotation_file_list = [
 2980                    value for value in param.get("annotations", "").split(",")
 2981                ]
 2982                for annotation_file in annotation_file_list:
 2983                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
 2984            else:
 2985                annotations_list_input = param.get("annotations", {})
 2986
 2987            log.info(f"Quick Annotations:")
 2988            for annotation_key in list(annotations_list_input.keys()):
 2989                log.info(f"   {annotation_key}")
 2990
 2991            # List of annotations and associated fields
 2992            annotations_list = {}
 2993
 2994            for annotation_file in annotations_list_input:
 2995
 2996                # Explode annotations if ALL
 2997                if (
 2998                    annotation_file.upper() == "ALL"
 2999                    or annotation_file.upper().startswith("ALL:")
 3000                ):
 3001
 3002                    # check ALL parameters (formats, releases)
 3003                    annotation_file_split = annotation_file.split(":")
 3004                    database_formats = "parquet"
 3005                    database_releases = "current"
 3006                    for annotation_file_option in annotation_file_split[1:]:
 3007                        database_all_options_split = annotation_file_option.split("=")
 3008                        if database_all_options_split[0] == "format":
 3009                            database_formats = database_all_options_split[1].split("+")
 3010                        if database_all_options_split[0] == "release":
 3011                            database_releases = database_all_options_split[1].split("+")
 3012
 3013                    # Scan for availabled databases
 3014                    databases_infos_dict = self.scan_databases(
 3015                        database_formats=database_formats,
 3016                        database_releases=database_releases,
 3017                    )
 3018
 3019                    # Add found databases in annotation parameters
 3020                    for database_infos in databases_infos_dict.keys():
 3021                        annotations_list[database_infos] = {"INFO": None}
 3022
 3023                else:
 3024                    annotations_list[annotation_file] = annotations_list_input[
 3025                        annotation_file
 3026                    ]
 3027
 3028            # Check each databases
 3029            if len(annotations_list):
 3030
 3031                log.info(
 3032                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 3033                )
 3034
 3035                for annotation_file in annotations_list:
 3036
 3037                    # Init
 3038                    annotations = annotations_list.get(annotation_file, None)
 3039
 3040                    # Annotation snpEff
 3041                    if annotation_file.startswith("snpeff"):
 3042
 3043                        log.debug(f"Quick Annotation snpEff")
 3044
 3045                        if "snpeff" not in param["annotation"]:
 3046                            param["annotation"]["snpeff"] = {}
 3047
 3048                        if "options" not in param["annotation"]["snpeff"]:
 3049                            param["annotation"]["snpeff"]["options"] = ""
 3050
 3051                        # snpEff options in annotations
 3052                        param["annotation"]["snpeff"]["options"] = "".join(
 3053                            annotation_file.split(":")[1:]
 3054                        )
 3055
 3056                    # Annotation Annovar
 3057                    elif annotation_file.startswith("annovar"):
 3058
 3059                        log.debug(f"Quick Annotation Annovar")
 3060
 3061                        if "annovar" not in param["annotation"]:
 3062                            param["annotation"]["annovar"] = {}
 3063
 3064                        if "annotations" not in param["annotation"]["annovar"]:
 3065                            param["annotation"]["annovar"]["annotations"] = {}
 3066
 3067                        # Options
 3068                        annotation_file_split = annotation_file.split(":")
 3069                        for annotation_file_annotation in annotation_file_split[1:]:
 3070                            if annotation_file_annotation:
 3071                                param["annotation"]["annovar"]["annotations"][
 3072                                    annotation_file_annotation
 3073                                ] = annotations
 3074
 3075                    # Annotation Exomiser
 3076                    elif annotation_file.startswith("exomiser"):
 3077
 3078                        log.debug(f"Quick Annotation Exomiser")
 3079
 3080                        param["annotation"]["exomiser"] = params_string_to_dict(
 3081                            annotation_file
 3082                        )
 3083
 3084                    # Annotation Splice
 3085                    elif annotation_file.startswith("splice"):
 3086
 3087                        log.debug(f"Quick Annotation Splice")
 3088
 3089                        param["annotation"]["splice"] = params_string_to_dict(
 3090                            annotation_file
 3091                        )
 3092
 3093                    # Annotation Parquet or BCFTOOLS
 3094                    else:
 3095
 3096                        # Tools detection
 3097                        if annotation_file.startswith("bcftools:"):
 3098                            annotation_tool_initial = "bcftools"
 3099                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3100                        elif annotation_file.startswith("snpsift:"):
 3101                            annotation_tool_initial = "snpsift"
 3102                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3103                        elif annotation_file.startswith("bigwig:"):
 3104                            annotation_tool_initial = "bigwig"
 3105                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3106                        else:
 3107                            annotation_tool_initial = None
 3108
 3109                        # list of files
 3110                        annotation_file_list = annotation_file.replace("+", ":").split(
 3111                            ":"
 3112                        )
 3113
 3114                        for annotation_file in annotation_file_list:
 3115
 3116                            if annotation_file:
 3117
 3118                                # Annotation tool initial
 3119                                annotation_tool = annotation_tool_initial
 3120
 3121                                # Find file
 3122                                annotation_file_found = None
 3123
 3124                                if os.path.exists(annotation_file):
 3125                                    annotation_file_found = annotation_file
 3126                                elif os.path.exists(full_path(annotation_file)):
 3127                                    annotation_file_found = full_path(annotation_file)
 3128                                else:
 3129                                    # Find within assembly folders
 3130                                    for annotations_database in annotations_databases:
 3131                                        found_files = find_all(
 3132                                            annotation_file,
 3133                                            os.path.join(
 3134                                                annotations_database, assembly
 3135                                            ),
 3136                                        )
 3137                                        if len(found_files) > 0:
 3138                                            annotation_file_found = found_files[0]
 3139                                            break
 3140                                    if not annotation_file_found and not assembly:
 3141                                        # Find within folders
 3142                                        for (
 3143                                            annotations_database
 3144                                        ) in annotations_databases:
 3145                                            found_files = find_all(
 3146                                                annotation_file, annotations_database
 3147                                            )
 3148                                            if len(found_files) > 0:
 3149                                                annotation_file_found = found_files[0]
 3150                                                break
 3151                                log.debug(
 3152                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3153                                )
 3154
 3155                                # Full path
 3156                                annotation_file_found = full_path(annotation_file_found)
 3157
 3158                                if annotation_file_found:
 3159
 3160                                    database = Database(database=annotation_file_found)
 3161                                    quick_annotation_format = database.get_format()
 3162                                    quick_annotation_is_compressed = (
 3163                                        database.is_compressed()
 3164                                    )
 3165                                    quick_annotation_is_indexed = os.path.exists(
 3166                                        f"{annotation_file_found}.tbi"
 3167                                    )
 3168                                    bcftools_preference = False
 3169
 3170                                    # Check Annotation Tool
 3171                                    if not annotation_tool:
 3172                                        if (
 3173                                            bcftools_preference
 3174                                            and quick_annotation_format
 3175                                            in ["vcf", "bed"]
 3176                                            and quick_annotation_is_compressed
 3177                                            and quick_annotation_is_indexed
 3178                                        ):
 3179                                            annotation_tool = "bcftools"
 3180                                        elif quick_annotation_format in [
 3181                                            "vcf",
 3182                                            "bed",
 3183                                            "tsv",
 3184                                            "tsv",
 3185                                            "csv",
 3186                                            "json",
 3187                                            "tbl",
 3188                                            "parquet",
 3189                                            "duckdb",
 3190                                        ]:
 3191                                            annotation_tool = "parquet"
 3192                                        elif quick_annotation_format in ["bw"]:
 3193                                            annotation_tool = "bigwig"
 3194                                        else:
 3195                                            log.error(
 3196                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3197                                            )
 3198                                            raise ValueError(
 3199                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3200                                            )
 3201
 3202                                    log.debug(
 3203                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3204                                    )
 3205
 3206                                    # Annotation Tool dispatch
 3207                                    if annotation_tool:
 3208                                        if annotation_tool not in param["annotation"]:
 3209                                            param["annotation"][annotation_tool] = {}
 3210                                        if (
 3211                                            "annotations"
 3212                                            not in param["annotation"][annotation_tool]
 3213                                        ):
 3214                                            param["annotation"][annotation_tool][
 3215                                                "annotations"
 3216                                            ] = {}
 3217                                        param["annotation"][annotation_tool][
 3218                                            "annotations"
 3219                                        ][annotation_file_found] = annotations
 3220
 3221                                else:
 3222                                    log.warning(
 3223                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3224                                    )
 3225
 3226                self.set_param(param)
 3227
 3228        if param.get("annotation", None):
 3229            log.info("Annotations")
 3230            if param.get("annotation", {}).get("parquet", None):
 3231                log.info("Annotations 'parquet'...")
 3232                self.annotation_parquet()
 3233            if param.get("annotation", {}).get("bcftools", None):
 3234                log.info("Annotations 'bcftools'...")
 3235                self.annotation_bcftools()
 3236            if param.get("annotation", {}).get("snpsift", None):
 3237                log.info("Annotations 'snpsift'...")
 3238                self.annotation_snpsift()
 3239            if param.get("annotation", {}).get("bigwig", None):
 3240                log.info("Annotations 'bigwig'...")
 3241                self.annotation_bigwig()
 3242            if param.get("annotation", {}).get("annovar", None):
 3243                log.info("Annotations 'annovar'...")
 3244                self.annotation_annovar()
 3245            if param.get("annotation", {}).get("snpeff", None):
 3246                log.info("Annotations 'snpeff'...")
 3247                self.annotation_snpeff()
 3248            if param.get("annotation", {}).get("exomiser", None) is not None:
 3249                log.info("Annotations 'exomiser'...")
 3250                self.annotation_exomiser()
 3251            if param.get("annotation", {}).get("splice", None) is not None:
 3252                log.info("Annotations 'splice' ...")
 3253                self.annotation_splice()
 3254
 3255        # Explode INFOS fields into table fields
 3256        if self.get_explode_infos():
 3257            self.explode_infos(
 3258                prefix=self.get_explode_infos_prefix(),
 3259                fields=self.get_explode_infos_fields(),
 3260                force=True,
 3261            )
 3262
 3263    def annotation_bigwig(self, threads: int = None) -> None:
 3264        """
 3265        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3266
 3267        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3268        number of threads to be used for parallel processing during the annotation process. If the
 3269        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3270        threads to use based on the system configuration
 3271        :type threads: int
 3272        :return: True
 3273        """
 3274
 3275        # DEBUG
 3276        log.debug("Start annotation with bigwig databases")
 3277
 3278        # # Threads
 3279        # if not threads:
 3280        #     threads = self.get_threads()
 3281        # log.debug("Threads: " + str(threads))
 3282
 3283        # Config
 3284        config = self.get_config()
 3285        log.debug("Config: " + str(config))
 3286
 3287        # Config - BCFTools databases folders
 3288        databases_folders = set(
 3289            self.get_config()
 3290            .get("folders", {})
 3291            .get("databases", {})
 3292            .get("annotations", ["."])
 3293            + self.get_config()
 3294            .get("folders", {})
 3295            .get("databases", {})
 3296            .get("bigwig", ["."])
 3297        )
 3298        log.debug("Databases annotations: " + str(databases_folders))
 3299
 3300        # Param
 3301        annotations = (
 3302            self.get_param()
 3303            .get("annotation", {})
 3304            .get("bigwig", {})
 3305            .get("annotations", None)
 3306        )
 3307        log.debug("Annotations: " + str(annotations))
 3308
 3309        # Assembly
 3310        assembly = self.get_param().get(
 3311            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3312        )
 3313
 3314        # Data
 3315        table_variants = self.get_table_variants()
 3316
 3317        # Check if not empty
 3318        log.debug("Check if not empty")
 3319        sql_query_chromosomes = (
 3320            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3321        )
 3322        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3323        if not sql_query_chromosomes_df["count"][0]:
 3324            log.info(f"VCF empty")
 3325            return
 3326
 3327        # VCF header
 3328        vcf_reader = self.get_header()
 3329        log.debug("Initial header: " + str(vcf_reader.infos))
 3330
 3331        # Existing annotations
 3332        for vcf_annotation in self.get_header().infos:
 3333
 3334            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3335            log.debug(
 3336                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3337            )
 3338
 3339        if annotations:
 3340
 3341            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3342
 3343                # Export VCF file
 3344                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3345
 3346                # annotation_bigwig_config
 3347                annotation_bigwig_config_list = []
 3348
 3349                for annotation in annotations:
 3350                    annotation_fields = annotations[annotation]
 3351
 3352                    # Annotation Name
 3353                    annotation_name = os.path.basename(annotation)
 3354
 3355                    if not annotation_fields:
 3356                        annotation_fields = {"INFO": None}
 3357
 3358                    log.debug(f"Annotation '{annotation_name}'")
 3359                    log.debug(
 3360                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3361                    )
 3362
 3363                    # Create Database
 3364                    database = Database(
 3365                        database=annotation,
 3366                        databases_folders=databases_folders,
 3367                        assembly=assembly,
 3368                    )
 3369
 3370                    # Find files
 3371                    db_file = database.get_database()
 3372                    db_file = full_path(db_file)
 3373                    db_hdr_file = database.get_header_file()
 3374                    db_hdr_file = full_path(db_hdr_file)
 3375                    db_file_type = database.get_format()
 3376
 3377                    # If db_file is http ?
 3378                    if database.get_database().startswith("http"):
 3379
 3380                        # Datbase is HTTP URL
 3381                        db_file_is_http = True
 3382
 3383                        # DB file keep as URL
 3384                        db_file = database.get_database()
 3385                        log.warning(
 3386                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3387                        )
 3388
 3389                        # Retrieve automatic annotation field name
 3390                        annotation_field = clean_annotation_field(
 3391                            os.path.basename(db_file).replace(".bw", "")
 3392                        )
 3393                        log.debug(
 3394                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3395                        )
 3396
 3397                        # Create automatic header file
 3398                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3399                        with open(db_hdr_file, "w") as f:
 3400                            f.write("##fileformat=VCFv4.2\n")
 3401                            f.write(
 3402                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3403                            )
 3404                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3405
 3406                    else:
 3407
 3408                        # Datbase is NOT HTTP URL
 3409                        db_file_is_http = False
 3410
 3411                    # Check index - try to create if not exists
 3412                    if (
 3413                        db_file is None
 3414                        or db_hdr_file is None
 3415                        or (not os.path.exists(db_file) and not db_file_is_http)
 3416                        or not os.path.exists(db_hdr_file)
 3417                        or not db_file_type in ["bw"]
 3418                    ):
 3419                        # if False:
 3420                        log.error("Annotation failed: database not valid")
 3421                        log.error(f"Annotation annotation file: {db_file}")
 3422                        log.error(f"Annotation annotation file type: {db_file_type}")
 3423                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3424                        raise ValueError(
 3425                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3426                        )
 3427                    else:
 3428
 3429                        # Log
 3430                        log.debug(
 3431                            f"Annotation '{annotation}' - file: "
 3432                            + str(db_file)
 3433                            + " and "
 3434                            + str(db_hdr_file)
 3435                        )
 3436
 3437                        # Load header as VCF object
 3438                        db_hdr_vcf = Variants(input=db_hdr_file)
 3439                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3440                        log.debug(
 3441                            "Annotation database header: "
 3442                            + str(db_hdr_vcf_header_infos)
 3443                        )
 3444
 3445                        # For all fields in database
 3446                        annotation_fields_full = False
 3447                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3448                            annotation_fields = {
 3449                                key: key for key in db_hdr_vcf_header_infos
 3450                            }
 3451                            log.debug(
 3452                                "Annotation database header - All annotations added: "
 3453                                + str(annotation_fields)
 3454                            )
 3455                            annotation_fields_full = True
 3456
 3457                        # Init
 3458                        cyvcf2_header_rename_dict = {}
 3459                        cyvcf2_header_list = []
 3460                        cyvcf2_header_indexes = {}
 3461
 3462                        # process annotation fields
 3463                        for annotation_field in annotation_fields:
 3464
 3465                            # New annotation name
 3466                            annotation_field_new = annotation_fields[annotation_field]
 3467
 3468                            # Check annotation field and index in header
 3469                            if (
 3470                                annotation_field
 3471                                in db_hdr_vcf.get_header_columns_as_list()
 3472                            ):
 3473                                annotation_field_index = (
 3474                                    db_hdr_vcf.get_header_columns_as_list().index(
 3475                                        annotation_field
 3476                                    )
 3477                                    - 3
 3478                                )
 3479                                cyvcf2_header_indexes[annotation_field_new] = (
 3480                                    annotation_field_index
 3481                                )
 3482                            else:
 3483                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3484                                log.error(msg_err)
 3485                                raise ValueError(msg_err)
 3486
 3487                            # Append annotation field in cyvcf2 header list
 3488                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3489                                db_hdr_vcf_header_infos[annotation_field].id
 3490                            )
 3491                            cyvcf2_header_list.append(
 3492                                {
 3493                                    "ID": annotation_field_new,
 3494                                    "Number": db_hdr_vcf_header_infos[
 3495                                        annotation_field
 3496                                    ].num,
 3497                                    "Type": db_hdr_vcf_header_infos[
 3498                                        annotation_field
 3499                                    ].type,
 3500                                    "Description": db_hdr_vcf_header_infos[
 3501                                        annotation_field
 3502                                    ].desc,
 3503                                }
 3504                            )
 3505
 3506                            # Add header on VCF
 3507                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3508                                annotation_field_new,
 3509                                db_hdr_vcf_header_infos[annotation_field].num,
 3510                                db_hdr_vcf_header_infos[annotation_field].type,
 3511                                db_hdr_vcf_header_infos[annotation_field].desc,
 3512                                "HOWARD BigWig annotation",
 3513                                "unknown",
 3514                                self.code_type_map[
 3515                                    db_hdr_vcf_header_infos[annotation_field].type
 3516                                ],
 3517                            )
 3518
 3519                        # Load bigwig database
 3520                        bw_db = pyBigWig.open(db_file)
 3521                        if bw_db.isBigWig():
 3522                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3523                        else:
 3524                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3525                            log.error(msg_err)
 3526                            raise ValueError(msg_err)
 3527
 3528                        annotation_bigwig_config_list.append(
 3529                            {
 3530                                "db_file": db_file,
 3531                                "bw_db": bw_db,
 3532                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3533                                "cyvcf2_header_list": cyvcf2_header_list,
 3534                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3535                            }
 3536                        )
 3537
 3538                # Annotate
 3539                if annotation_bigwig_config_list:
 3540
 3541                    # Annotation config
 3542                    log.debug(
 3543                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3544                    )
 3545
 3546                    # Export VCF file
 3547                    self.export_variant_vcf(
 3548                        vcf_file=tmp_vcf_name,
 3549                        remove_info=True,
 3550                        add_samples=False,
 3551                        index=True,
 3552                    )
 3553
 3554                    # Load input tmp file
 3555                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3556
 3557                    # Add header in input file
 3558                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3559                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3560                            "cyvcf2_header_list", []
 3561                        ):
 3562                            log.info(
 3563                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3564                            )
 3565                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3566
 3567                    # Create output VCF file
 3568                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3569                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3570
 3571                    # Fetch variants
 3572                    log.info(f"Annotations 'bigwig' start...")
 3573                    for variant in input_vcf:
 3574
 3575                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3576
 3577                            # DB and indexes
 3578                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3579                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3580                                "cyvcf2_header_indexes", None
 3581                            )
 3582
 3583                            # Retrieve value from chrom pos
 3584                            res = bw_db.values(
 3585                                variant.CHROM, variant.POS - 1, variant.POS
 3586                            )
 3587
 3588                            # For each annotation fields (and indexes)
 3589                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3590
 3591                                # If value is NOT nNone
 3592                                if not np.isnan(
 3593                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3594                                ):
 3595                                    variant.INFO[cyvcf2_header_index] = res[
 3596                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3597                                    ]
 3598
 3599                        # Add record in output file
 3600                        output_vcf.write_record(variant)
 3601
 3602                    # Log
 3603                    log.debug(f"Annotation done.")
 3604
 3605                    # Close and write file
 3606                    log.info(f"Annotations 'bigwig' write...")
 3607                    output_vcf.close()
 3608                    log.debug(f"Write done.")
 3609
 3610                    # Update variants
 3611                    log.info(f"Annotations 'bigwig' update...")
 3612                    self.update_from_vcf(output_vcf_file)
 3613                    log.debug(f"Update done.")
 3614
 3615        return True
 3616
 3617    def annotation_snpsift(self, threads: int = None) -> None:
 3618        """
 3619        This function annotate with bcftools
 3620
 3621        :param threads: Number of threads to use
 3622        :return: the value of the variable "return_value".
 3623        """
 3624
 3625        # DEBUG
 3626        log.debug("Start annotation with bcftools databases")
 3627
 3628        # Threads
 3629        if not threads:
 3630            threads = self.get_threads()
 3631        log.debug("Threads: " + str(threads))
 3632
 3633        # Config
 3634        config = self.get_config()
 3635        log.debug("Config: " + str(config))
 3636
 3637        # Config - snpSift
 3638        snpsift_bin_command = get_bin_command(
 3639            bin="SnpSift.jar",
 3640            tool="snpsift",
 3641            bin_type="jar",
 3642            config=config,
 3643            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3644        )
 3645        if not snpsift_bin_command:
 3646            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3647            log.error(msg_err)
 3648            raise ValueError(msg_err)
 3649
 3650        # Config - bcftools
 3651        bcftools_bin_command = get_bin_command(
 3652            bin="bcftools",
 3653            tool="bcftools",
 3654            bin_type="bin",
 3655            config=config,
 3656            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3657        )
 3658        if not bcftools_bin_command:
 3659            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3660            log.error(msg_err)
 3661            raise ValueError(msg_err)
 3662
 3663        # Config - BCFTools databases folders
 3664        databases_folders = set(
 3665            self.get_config()
 3666            .get("folders", {})
 3667            .get("databases", {})
 3668            .get("annotations", ["."])
 3669            + self.get_config()
 3670            .get("folders", {})
 3671            .get("databases", {})
 3672            .get("bcftools", ["."])
 3673        )
 3674        log.debug("Databases annotations: " + str(databases_folders))
 3675
 3676        # Param
 3677        annotations = (
 3678            self.get_param()
 3679            .get("annotation", {})
 3680            .get("snpsift", {})
 3681            .get("annotations", None)
 3682        )
 3683        log.debug("Annotations: " + str(annotations))
 3684
 3685        # Assembly
 3686        assembly = self.get_param().get(
 3687            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3688        )
 3689
 3690        # Data
 3691        table_variants = self.get_table_variants()
 3692
 3693        # Check if not empty
 3694        log.debug("Check if not empty")
 3695        sql_query_chromosomes = (
 3696            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3697        )
 3698        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3699        if not sql_query_chromosomes_df["count"][0]:
 3700            log.info(f"VCF empty")
 3701            return
 3702
 3703        # VCF header
 3704        vcf_reader = self.get_header()
 3705        log.debug("Initial header: " + str(vcf_reader.infos))
 3706
 3707        # Existing annotations
 3708        for vcf_annotation in self.get_header().infos:
 3709
 3710            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3711            log.debug(
 3712                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3713            )
 3714
 3715        if annotations:
 3716
 3717            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3718
 3719                # Export VCF file
 3720                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3721
 3722                # Init
 3723                commands = {}
 3724
 3725                for annotation in annotations:
 3726                    annotation_fields = annotations[annotation]
 3727
 3728                    # Annotation Name
 3729                    annotation_name = os.path.basename(annotation)
 3730
 3731                    if not annotation_fields:
 3732                        annotation_fields = {"INFO": None}
 3733
 3734                    log.debug(f"Annotation '{annotation_name}'")
 3735                    log.debug(
 3736                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3737                    )
 3738
 3739                    # Create Database
 3740                    database = Database(
 3741                        database=annotation,
 3742                        databases_folders=databases_folders,
 3743                        assembly=assembly,
 3744                    )
 3745
 3746                    # Find files
 3747                    db_file = database.get_database()
 3748                    db_file = full_path(db_file)
 3749                    db_hdr_file = database.get_header_file()
 3750                    db_hdr_file = full_path(db_hdr_file)
 3751                    db_file_type = database.get_format()
 3752                    db_tbi_file = f"{db_file}.tbi"
 3753                    db_file_compressed = database.is_compressed()
 3754
 3755                    # Check if compressed
 3756                    if not db_file_compressed:
 3757                        log.error(
 3758                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3759                        )
 3760                        raise ValueError(
 3761                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3762                        )
 3763
 3764                    # Check if indexed
 3765                    if not os.path.exists(db_tbi_file):
 3766                        log.error(
 3767                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3768                        )
 3769                        raise ValueError(
 3770                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3771                        )
 3772
 3773                    # Check index - try to create if not exists
 3774                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3775                        log.error("Annotation failed: database not valid")
 3776                        log.error(f"Annotation annotation file: {db_file}")
 3777                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3778                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3779                        raise ValueError(
 3780                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3781                        )
 3782                    else:
 3783
 3784                        log.debug(
 3785                            f"Annotation '{annotation}' - file: "
 3786                            + str(db_file)
 3787                            + " and "
 3788                            + str(db_hdr_file)
 3789                        )
 3790
 3791                        # Load header as VCF object
 3792                        db_hdr_vcf = Variants(input=db_hdr_file)
 3793                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3794                        log.debug(
 3795                            "Annotation database header: "
 3796                            + str(db_hdr_vcf_header_infos)
 3797                        )
 3798
 3799                        # For all fields in database
 3800                        annotation_fields_full = False
 3801                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3802                            annotation_fields = {
 3803                                key: key for key in db_hdr_vcf_header_infos
 3804                            }
 3805                            log.debug(
 3806                                "Annotation database header - All annotations added: "
 3807                                + str(annotation_fields)
 3808                            )
 3809                            annotation_fields_full = True
 3810
 3811                        # # Create file for field rename
 3812                        # log.debug("Create file for field rename")
 3813                        # tmp_rename = NamedTemporaryFile(
 3814                        #     prefix=self.get_prefix(),
 3815                        #     dir=self.get_tmp_dir(),
 3816                        #     suffix=".rename",
 3817                        #     delete=False,
 3818                        # )
 3819                        # tmp_rename_name = tmp_rename.name
 3820                        # tmp_files.append(tmp_rename_name)
 3821
 3822                        # Number of fields
 3823                        nb_annotation_field = 0
 3824                        annotation_list = []
 3825                        annotation_infos_rename_list = []
 3826
 3827                        for annotation_field in annotation_fields:
 3828
 3829                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3830                            annotation_fields_new_name = annotation_fields.get(
 3831                                annotation_field, annotation_field
 3832                            )
 3833                            if not annotation_fields_new_name:
 3834                                annotation_fields_new_name = annotation_field
 3835
 3836                            # Check if field is in DB and if field is not elready in input data
 3837                            if (
 3838                                annotation_field in db_hdr_vcf.get_header().infos
 3839                                and annotation_fields_new_name
 3840                                not in self.get_header().infos
 3841                            ):
 3842
 3843                                log.info(
 3844                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3845                                )
 3846
 3847                                # BCFTools annotate param to rename fields
 3848                                if annotation_field != annotation_fields_new_name:
 3849                                    annotation_infos_rename_list.append(
 3850                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3851                                    )
 3852
 3853                                # Add INFO field to header
 3854                                db_hdr_vcf_header_infos_number = (
 3855                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3856                                )
 3857                                db_hdr_vcf_header_infos_type = (
 3858                                    db_hdr_vcf_header_infos[annotation_field].type
 3859                                    or "String"
 3860                                )
 3861                                db_hdr_vcf_header_infos_description = (
 3862                                    db_hdr_vcf_header_infos[annotation_field].desc
 3863                                    or f"{annotation_field} description"
 3864                                )
 3865                                db_hdr_vcf_header_infos_source = (
 3866                                    db_hdr_vcf_header_infos[annotation_field].source
 3867                                    or "unknown"
 3868                                )
 3869                                db_hdr_vcf_header_infos_version = (
 3870                                    db_hdr_vcf_header_infos[annotation_field].version
 3871                                    or "unknown"
 3872                                )
 3873
 3874                                vcf_reader.infos[annotation_fields_new_name] = (
 3875                                    vcf.parser._Info(
 3876                                        annotation_fields_new_name,
 3877                                        db_hdr_vcf_header_infos_number,
 3878                                        db_hdr_vcf_header_infos_type,
 3879                                        db_hdr_vcf_header_infos_description,
 3880                                        db_hdr_vcf_header_infos_source,
 3881                                        db_hdr_vcf_header_infos_version,
 3882                                        self.code_type_map[
 3883                                            db_hdr_vcf_header_infos_type
 3884                                        ],
 3885                                    )
 3886                                )
 3887
 3888                                annotation_list.append(annotation_field)
 3889
 3890                                nb_annotation_field += 1
 3891
 3892                            else:
 3893
 3894                                if (
 3895                                    annotation_field
 3896                                    not in db_hdr_vcf.get_header().infos
 3897                                ):
 3898                                    log.warning(
 3899                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3900                                    )
 3901                                if (
 3902                                    annotation_fields_new_name
 3903                                    in self.get_header().infos
 3904                                ):
 3905                                    log.warning(
 3906                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3907                                    )
 3908
 3909                        log.info(
 3910                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3911                        )
 3912
 3913                        annotation_infos = ",".join(annotation_list)
 3914
 3915                        if annotation_infos != "":
 3916
 3917                            # Annotated VCF (and error file)
 3918                            tmp_annotation_vcf_name = os.path.join(
 3919                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3920                            )
 3921                            tmp_annotation_vcf_name_err = (
 3922                                tmp_annotation_vcf_name + ".err"
 3923                            )
 3924
 3925                            # Add fields to annotate
 3926                            if not annotation_fields_full:
 3927                                annotation_infos_option = f"-info {annotation_infos}"
 3928                            else:
 3929                                annotation_infos_option = ""
 3930
 3931                            # Info fields rename
 3932                            if annotation_infos_rename_list:
 3933                                annotation_infos_rename = " -c " + ",".join(
 3934                                    annotation_infos_rename_list
 3935                                )
 3936                            else:
 3937                                annotation_infos_rename = ""
 3938
 3939                            # Annotate command
 3940                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3941
 3942                            # Add command
 3943                            commands[command_annotate] = tmp_annotation_vcf_name
 3944
 3945                if commands:
 3946
 3947                    # Export VCF file
 3948                    self.export_variant_vcf(
 3949                        vcf_file=tmp_vcf_name,
 3950                        remove_info=True,
 3951                        add_samples=False,
 3952                        index=True,
 3953                    )
 3954                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3955
 3956                    # Num command
 3957                    nb_command = 0
 3958
 3959                    # Annotate
 3960                    for command_annotate in commands:
 3961                        nb_command += 1
 3962                        log.info(
 3963                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3964                        )
 3965                        log.debug(f"command_annotate={command_annotate}")
 3966                        run_parallel_commands([command_annotate], threads)
 3967
 3968                        # Debug
 3969                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3970
 3971                        # Update variants
 3972                        log.info(
 3973                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3974                        )
 3975                        self.update_from_vcf(commands[command_annotate])
 3976
 3977    def annotation_bcftools(self, threads: int = None) -> None:
 3978        """
 3979        This function annotate with bcftools
 3980
 3981        :param threads: Number of threads to use
 3982        :return: the value of the variable "return_value".
 3983        """
 3984
 3985        # DEBUG
 3986        log.debug("Start annotation with bcftools databases")
 3987
 3988        # Threads
 3989        if not threads:
 3990            threads = self.get_threads()
 3991        log.debug("Threads: " + str(threads))
 3992
 3993        # Config
 3994        config = self.get_config()
 3995        log.debug("Config: " + str(config))
 3996
 3997        # DEBUG
 3998        delete_tmp = True
 3999        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4000            delete_tmp = False
 4001            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4002
 4003        # Config - BCFTools bin command
 4004        bcftools_bin_command = get_bin_command(
 4005            bin="bcftools",
 4006            tool="bcftools",
 4007            bin_type="bin",
 4008            config=config,
 4009            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 4010        )
 4011        if not bcftools_bin_command:
 4012            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 4013            log.error(msg_err)
 4014            raise ValueError(msg_err)
 4015
 4016        # Config - BCFTools databases folders
 4017        databases_folders = set(
 4018            self.get_config()
 4019            .get("folders", {})
 4020            .get("databases", {})
 4021            .get("annotations", ["."])
 4022            + self.get_config()
 4023            .get("folders", {})
 4024            .get("databases", {})
 4025            .get("bcftools", ["."])
 4026        )
 4027        log.debug("Databases annotations: " + str(databases_folders))
 4028
 4029        # Param
 4030        annotations = (
 4031            self.get_param()
 4032            .get("annotation", {})
 4033            .get("bcftools", {})
 4034            .get("annotations", None)
 4035        )
 4036        log.debug("Annotations: " + str(annotations))
 4037
 4038        # Assembly
 4039        assembly = self.get_param().get(
 4040            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 4041        )
 4042
 4043        # Data
 4044        table_variants = self.get_table_variants()
 4045
 4046        # Check if not empty
 4047        log.debug("Check if not empty")
 4048        sql_query_chromosomes = (
 4049            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4050        )
 4051        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 4052        if not sql_query_chromosomes_df["count"][0]:
 4053            log.info(f"VCF empty")
 4054            return
 4055
 4056        # Export in VCF
 4057        log.debug("Create initial file to annotate")
 4058        tmp_vcf = NamedTemporaryFile(
 4059            prefix=self.get_prefix(),
 4060            dir=self.get_tmp_dir(),
 4061            suffix=".vcf.gz",
 4062            delete=False,
 4063        )
 4064        tmp_vcf_name = tmp_vcf.name
 4065
 4066        # VCF header
 4067        vcf_reader = self.get_header()
 4068        log.debug("Initial header: " + str(vcf_reader.infos))
 4069
 4070        # Existing annotations
 4071        for vcf_annotation in self.get_header().infos:
 4072
 4073            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4074            log.debug(
 4075                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4076            )
 4077
 4078        if annotations:
 4079
 4080            tmp_ann_vcf_list = []
 4081            commands = []
 4082            tmp_files = []
 4083            err_files = []
 4084
 4085            for annotation in annotations:
 4086                annotation_fields = annotations[annotation]
 4087
 4088                # Annotation Name
 4089                annotation_name = os.path.basename(annotation)
 4090
 4091                if not annotation_fields:
 4092                    annotation_fields = {"INFO": None}
 4093
 4094                log.debug(f"Annotation '{annotation_name}'")
 4095                log.debug(
 4096                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4097                )
 4098
 4099                # Create Database
 4100                database = Database(
 4101                    database=annotation,
 4102                    databases_folders=databases_folders,
 4103                    assembly=assembly,
 4104                )
 4105
 4106                # Find files
 4107                db_file = database.get_database()
 4108                db_file = full_path(db_file)
 4109                db_hdr_file = database.get_header_file()
 4110                db_hdr_file = full_path(db_hdr_file)
 4111                db_file_type = database.get_format()
 4112                db_tbi_file = f"{db_file}.tbi"
 4113                db_file_compressed = database.is_compressed()
 4114
 4115                # Check if compressed
 4116                if not db_file_compressed:
 4117                    log.error(
 4118                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4119                    )
 4120                    raise ValueError(
 4121                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4122                    )
 4123
 4124                # Check if indexed
 4125                if not os.path.exists(db_tbi_file):
 4126                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4127                    raise ValueError(
 4128                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4129                    )
 4130
 4131                # Check index - try to create if not exists
 4132                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4133                    log.error("Annotation failed: database not valid")
 4134                    log.error(f"Annotation annotation file: {db_file}")
 4135                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4136                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4137                    raise ValueError(
 4138                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4139                    )
 4140                else:
 4141
 4142                    log.debug(
 4143                        f"Annotation '{annotation}' - file: "
 4144                        + str(db_file)
 4145                        + " and "
 4146                        + str(db_hdr_file)
 4147                    )
 4148
 4149                    # Load header as VCF object
 4150                    db_hdr_vcf = Variants(input=db_hdr_file)
 4151                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4152                    log.debug(
 4153                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4154                    )
 4155
 4156                    # For all fields in database
 4157                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4158                        annotation_fields = {
 4159                            key: key for key in db_hdr_vcf_header_infos
 4160                        }
 4161                        log.debug(
 4162                            "Annotation database header - All annotations added: "
 4163                            + str(annotation_fields)
 4164                        )
 4165
 4166                    # Number of fields
 4167                    nb_annotation_field = 0
 4168                    annotation_list = []
 4169
 4170                    for annotation_field in annotation_fields:
 4171
 4172                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4173                        annotation_fields_new_name = annotation_fields.get(
 4174                            annotation_field, annotation_field
 4175                        )
 4176                        if not annotation_fields_new_name:
 4177                            annotation_fields_new_name = annotation_field
 4178
 4179                        # Check if field is in DB and if field is not elready in input data
 4180                        if (
 4181                            annotation_field in db_hdr_vcf.get_header().infos
 4182                            and annotation_fields_new_name
 4183                            not in self.get_header().infos
 4184                        ):
 4185
 4186                            log.info(
 4187                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4188                            )
 4189
 4190                            # Add INFO field to header
 4191                            db_hdr_vcf_header_infos_number = (
 4192                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4193                            )
 4194                            db_hdr_vcf_header_infos_type = (
 4195                                db_hdr_vcf_header_infos[annotation_field].type
 4196                                or "String"
 4197                            )
 4198                            db_hdr_vcf_header_infos_description = (
 4199                                db_hdr_vcf_header_infos[annotation_field].desc
 4200                                or f"{annotation_field} description"
 4201                            )
 4202                            db_hdr_vcf_header_infos_source = (
 4203                                db_hdr_vcf_header_infos[annotation_field].source
 4204                                or "unknown"
 4205                            )
 4206                            db_hdr_vcf_header_infos_version = (
 4207                                db_hdr_vcf_header_infos[annotation_field].version
 4208                                or "unknown"
 4209                            )
 4210
 4211                            vcf_reader.infos[annotation_fields_new_name] = (
 4212                                vcf.parser._Info(
 4213                                    annotation_fields_new_name,
 4214                                    db_hdr_vcf_header_infos_number,
 4215                                    db_hdr_vcf_header_infos_type,
 4216                                    db_hdr_vcf_header_infos_description,
 4217                                    db_hdr_vcf_header_infos_source,
 4218                                    db_hdr_vcf_header_infos_version,
 4219                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4220                                )
 4221                            )
 4222
 4223                            # annotation_list.append(annotation_field)
 4224                            if annotation_field != annotation_fields_new_name:
 4225                                annotation_list.append(
 4226                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4227                                )
 4228                            else:
 4229                                annotation_list.append(annotation_field)
 4230
 4231                            nb_annotation_field += 1
 4232
 4233                        else:
 4234
 4235                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4236                                log.warning(
 4237                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4238                                )
 4239                            if annotation_fields_new_name in self.get_header().infos:
 4240                                log.warning(
 4241                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4242                                )
 4243
 4244                    log.info(
 4245                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4246                    )
 4247
 4248                    annotation_infos = ",".join(annotation_list)
 4249
 4250                    if annotation_infos != "":
 4251
 4252                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4253                        log.debug("Protect Header file - remove #CHROM line if exists")
 4254                        tmp_header_vcf = NamedTemporaryFile(
 4255                            prefix=self.get_prefix(),
 4256                            dir=self.get_tmp_dir(),
 4257                            suffix=".hdr",
 4258                            delete=False,
 4259                        )
 4260                        tmp_header_vcf_name = tmp_header_vcf.name
 4261                        tmp_files.append(tmp_header_vcf_name)
 4262                        # Command
 4263                        if db_hdr_file.endswith(".gz"):
 4264                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4265                        else:
 4266                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4267                        # Run
 4268                        run_parallel_commands([command_extract_header], 1)
 4269
 4270                        # Find chromosomes
 4271                        log.debug("Find chromosomes ")
 4272                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4273                        sql_query_chromosomes_df = self.get_query_to_df(
 4274                            sql_query_chromosomes
 4275                        )
 4276                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4277
 4278                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4279
 4280                        # BED columns in the annotation file
 4281                        if db_file_type in ["bed"]:
 4282                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4283
 4284                        for chrom in chomosomes_list:
 4285
 4286                            # Create BED on initial VCF
 4287                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4288                            tmp_bed = NamedTemporaryFile(
 4289                                prefix=self.get_prefix(),
 4290                                dir=self.get_tmp_dir(),
 4291                                suffix=".bed",
 4292                                delete=False,
 4293                            )
 4294                            tmp_bed_name = tmp_bed.name
 4295                            tmp_files.append(tmp_bed_name)
 4296
 4297                            # Detect regions
 4298                            log.debug(
 4299                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4300                            )
 4301                            window = 1000000
 4302                            sql_query_intervals_for_bed = f"""
 4303                                SELECT  \"#CHROM\",
 4304                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4305                                        \"POS\"+{window}
 4306                                FROM {table_variants} as table_variants
 4307                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4308                            """
 4309                            regions = self.conn.execute(
 4310                                sql_query_intervals_for_bed
 4311                            ).fetchall()
 4312                            merged_regions = merge_regions(regions)
 4313                            log.debug(
 4314                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4315                            )
 4316
 4317                            header = ["#CHROM", "START", "END"]
 4318                            with open(tmp_bed_name, "w") as f:
 4319                                # Write the header with tab delimiter
 4320                                f.write("\t".join(header) + "\n")
 4321                                for d in merged_regions:
 4322                                    # Write each data row with tab delimiter
 4323                                    f.write("\t".join(map(str, d)) + "\n")
 4324
 4325                            # Tmp files
 4326                            tmp_annotation_vcf = NamedTemporaryFile(
 4327                                prefix=self.get_prefix(),
 4328                                dir=self.get_tmp_dir(),
 4329                                suffix=".vcf.gz",
 4330                                delete=False,
 4331                            )
 4332                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4333                            tmp_files.append(tmp_annotation_vcf_name)
 4334                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4335                            tmp_annotation_vcf_name_err = (
 4336                                tmp_annotation_vcf_name + ".err"
 4337                            )
 4338                            err_files.append(tmp_annotation_vcf_name_err)
 4339
 4340                            # Annotate Command
 4341                            log.debug(
 4342                                f"Annotation '{annotation}' - add bcftools command"
 4343                            )
 4344
 4345                            # Command
 4346                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4347
 4348                            # Add command
 4349                            commands.append(command_annotate)
 4350
 4351            # if some commands
 4352            if commands:
 4353
 4354                # Export VCF file
 4355                self.export_variant_vcf(
 4356                    vcf_file=tmp_vcf_name,
 4357                    remove_info=True,
 4358                    add_samples=False,
 4359                    index=True,
 4360                )
 4361
 4362                # Threads
 4363                # calculate threads for annotated commands
 4364                if commands:
 4365                    threads_bcftools_annotate = round(threads / len(commands))
 4366                else:
 4367                    threads_bcftools_annotate = 1
 4368
 4369                if not threads_bcftools_annotate:
 4370                    threads_bcftools_annotate = 1
 4371
 4372                # Add threads option to bcftools commands
 4373                if threads_bcftools_annotate > 1:
 4374                    commands_threaded = []
 4375                    for command in commands:
 4376                        commands_threaded.append(
 4377                            command.replace(
 4378                                f"{bcftools_bin_command} annotate ",
 4379                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4380                            )
 4381                        )
 4382                    commands = commands_threaded
 4383
 4384                # Command annotation multithreading
 4385                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4386                log.info(
 4387                    f"Annotation - Annotation multithreaded in "
 4388                    + str(len(commands))
 4389                    + " commands"
 4390                )
 4391
 4392                run_parallel_commands(commands, threads)
 4393
 4394                # Merge
 4395                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4396
 4397                if tmp_ann_vcf_list_cmd:
 4398
 4399                    # Tmp file
 4400                    tmp_annotate_vcf = NamedTemporaryFile(
 4401                        prefix=self.get_prefix(),
 4402                        dir=self.get_tmp_dir(),
 4403                        suffix=".vcf.gz",
 4404                        delete=True,
 4405                    )
 4406                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4407                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4408                    err_files.append(tmp_annotate_vcf_name_err)
 4409
 4410                    # Tmp file remove command
 4411                    tmp_files_remove_command = ""
 4412                    if tmp_files:
 4413                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4414
 4415                    # Command merge
 4416                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4417                    log.info(
 4418                        f"Annotation - Annotation merging "
 4419                        + str(len(commands))
 4420                        + " annotated files"
 4421                    )
 4422                    log.debug(f"Annotation - merge command: {merge_command}")
 4423                    run_parallel_commands([merge_command], 1)
 4424
 4425                    # Error messages
 4426                    log.info(f"Error/Warning messages:")
 4427                    error_message_command_all = []
 4428                    error_message_command_warning = []
 4429                    error_message_command_err = []
 4430                    for err_file in err_files:
 4431                        with open(err_file, "r") as f:
 4432                            for line in f:
 4433                                message = line.strip()
 4434                                error_message_command_all.append(message)
 4435                                if line.startswith("[W::"):
 4436                                    error_message_command_warning.append(message)
 4437                                if line.startswith("[E::"):
 4438                                    error_message_command_err.append(
 4439                                        f"{err_file}: " + message
 4440                                    )
 4441                    # log info
 4442                    for message in list(
 4443                        set(error_message_command_err + error_message_command_warning)
 4444                    ):
 4445                        log.info(f"   {message}")
 4446                    # debug info
 4447                    for message in list(set(error_message_command_all)):
 4448                        log.debug(f"   {message}")
 4449                    # failed
 4450                    if len(error_message_command_err):
 4451                        log.error("Annotation failed: Error in commands")
 4452                        raise ValueError("Annotation failed: Error in commands")
 4453
 4454                    # Update variants
 4455                    log.info(f"Annotation - Updating...")
 4456                    self.update_from_vcf(tmp_annotate_vcf_name)
 4457
 4458    def annotation_exomiser(self, threads: int = None) -> None:
 4459        """
 4460        This function annotate with Exomiser
 4461
 4462        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4463        - "analysis" (dict/file):
 4464            Full analysis dictionary parameters (see Exomiser docs).
 4465            Either a dict, or a file in JSON or YAML format.
 4466            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 4467            Default : None
 4468        - "preset" (string):
 4469            Analysis preset (available in config folder).
 4470            Used if no full "analysis" is provided.
 4471            Default: "exome"
 4472        - "phenopacket" (dict/file):
 4473            Samples and phenotypic features parameters (see Exomiser docs).
 4474            Either a dict, or a file in JSON or YAML format.
 4475            Default: None
 4476        - "subject" (dict):
 4477            Sample parameters (see Exomiser docs).
 4478            Example:
 4479                "subject":
 4480                    {
 4481                        "id": "ISDBM322017",
 4482                        "sex": "FEMALE"
 4483                    }
 4484            Default: None
 4485        - "sample" (string):
 4486            Sample name to construct "subject" section:
 4487                "subject":
 4488                    {
 4489                        "id": "<sample>",
 4490                        "sex": "UNKNOWN_SEX"
 4491                    }
 4492            Default: None
 4493        - "phenotypicFeatures" (dict)
 4494            Phenotypic features to construct "subject" section.
 4495            Example:
 4496                "phenotypicFeatures":
 4497                    [
 4498                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4499                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4500                    ]
 4501        - "hpo" (list)
 4502            List of HPO ids as phenotypic features.
 4503            Example:
 4504                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4505            Default: []
 4506        - "outputOptions" (dict):
 4507            Output options (see Exomiser docs).
 4508            Default:
 4509                "output_options" =
 4510                    {
 4511                        "outputContributingVariantsOnly": False,
 4512                        "numGenes": 0,
 4513                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4514                    }
 4515        - "transcript_source" (string):
 4516            Transcript source (either "refseq", "ucsc", "ensembl")
 4517            Default: "refseq"
 4518        - "exomiser_to_info" (boolean):
 4519            Add exomiser TSV file columns as INFO fields in VCF.
 4520            Default: False
 4521        - "release" (string):
 4522            Exomiser database release.
 4523            If not exists, database release will be downloaded (take a while).
 4524            Default: None (provided by application.properties configuration file)
 4525        - "exomiser_application_properties" (file):
 4526            Exomiser configuration file (see Exomiser docs).
 4527            Useful to automatically download databases (especially for specific genome databases).
 4528
 4529        Notes:
 4530        - If no sample in parameters, first sample in VCF will be chosen
 4531        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
 4532
 4533        :param threads: The number of threads to use
 4534        :return: None.
 4535        """
 4536
 4537        # DEBUG
 4538        log.debug("Start annotation with Exomiser databases")
 4539
 4540        # Threads
 4541        if not threads:
 4542            threads = self.get_threads()
 4543        log.debug("Threads: " + str(threads))
 4544
 4545        # Config
 4546        config = self.get_config()
 4547        log.debug("Config: " + str(config))
 4548
 4549        # Config - Folders - Databases
 4550        databases_folders = (
 4551            config.get("folders", {})
 4552            .get("databases", {})
 4553            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4554        )
 4555        databases_folders = full_path(databases_folders)
 4556        if not os.path.exists(databases_folders):
 4557            log.error(f"Databases annotations: {databases_folders} NOT found")
 4558        log.debug("Databases annotations: " + str(databases_folders))
 4559
 4560        # Config - Exomiser
 4561        exomiser_bin_command = get_bin_command(
 4562            bin="exomiser-cli*.jar",
 4563            tool="exomiser",
 4564            bin_type="jar",
 4565            config=config,
 4566            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4567        )
 4568        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4569        if not exomiser_bin_command:
 4570            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4571            log.error(msg_err)
 4572            raise ValueError(msg_err)
 4573
 4574        # Param
 4575        param = self.get_param()
 4576        log.debug("Param: " + str(param))
 4577
 4578        # Param - Exomiser
 4579        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4580        log.debug(f"Param Exomiser: {param_exomiser}")
 4581
 4582        # Param - Assembly
 4583        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4584        log.debug("Assembly: " + str(assembly))
 4585
 4586        # Data
 4587        table_variants = self.get_table_variants()
 4588
 4589        # Check if not empty
 4590        log.debug("Check if not empty")
 4591        sql_query_chromosomes = (
 4592            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4593        )
 4594        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4595            log.info(f"VCF empty")
 4596            return False
 4597
 4598        # VCF header
 4599        vcf_reader = self.get_header()
 4600        log.debug("Initial header: " + str(vcf_reader.infos))
 4601
 4602        # Samples
 4603        samples = self.get_header_sample_list()
 4604        if not samples:
 4605            log.error("No Samples in VCF")
 4606            return False
 4607        log.debug(f"Samples: {samples}")
 4608
 4609        # Memory limit
 4610        memory_limit = self.get_memory("8G")
 4611        log.debug(f"memory_limit: {memory_limit}")
 4612
 4613        # Exomiser java options
 4614        exomiser_java_options = (
 4615            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4616        )
 4617        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4618
 4619        # Download Exomiser (if not exists)
 4620        exomiser_release = param_exomiser.get("release", None)
 4621        exomiser_application_properties = param_exomiser.get(
 4622            "exomiser_application_properties", None
 4623        )
 4624        databases_download_exomiser(
 4625            assemblies=[assembly],
 4626            exomiser_folder=databases_folders,
 4627            exomiser_release=exomiser_release,
 4628            exomiser_phenotype_release=exomiser_release,
 4629            exomiser_application_properties=exomiser_application_properties,
 4630        )
 4631
 4632        # Force annotation
 4633        force_update_annotation = True
 4634
 4635        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4636            log.debug("Start annotation Exomiser")
 4637
 4638            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4639
 4640                # tmp_dir = "/tmp/exomiser"
 4641
 4642                ### ANALYSIS ###
 4643                ################
 4644
 4645                # Create analysis.json through analysis dict
 4646                # either analysis in param or by default
 4647                # depending on preset exome/genome)
 4648
 4649                # Init analysis dict
 4650                param_exomiser_analysis_dict = {}
 4651
 4652                # analysis from param
 4653                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4654                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4655
 4656                # If analysis in param -> load analysis json
 4657                if param_exomiser_analysis:
 4658
 4659                    # If param analysis is a file and exists
 4660                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4661                        param_exomiser_analysis
 4662                    ):
 4663                        # Load analysis file into analysis dict (either yaml or json)
 4664                        with open(param_exomiser_analysis) as json_file:
 4665                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4666
 4667                    # If param analysis is a dict
 4668                    elif isinstance(param_exomiser_analysis, dict):
 4669                        # Load analysis dict into analysis dict (either yaml or json)
 4670                        param_exomiser_analysis_dict = param_exomiser_analysis
 4671
 4672                    # Error analysis type
 4673                    else:
 4674                        log.error(f"Analysis type unknown. Check param file.")
 4675                        raise ValueError(f"Analysis type unknown. Check param file.")
 4676
 4677                # Case no input analysis config file/dict
 4678                # Use preset (exome/genome) to open default config file
 4679                if not param_exomiser_analysis_dict:
 4680
 4681                    # default preset
 4682                    default_preset = "exome"
 4683
 4684                    # Get param preset or default preset
 4685                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4686
 4687                    # Try to find if preset is a file
 4688                    if os.path.exists(param_exomiser_preset):
 4689                        # Preset file is provided in full path
 4690                        param_exomiser_analysis_default_config_file = (
 4691                            param_exomiser_preset
 4692                        )
 4693                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4694                    #     # Preset file is provided in full path
 4695                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4696                    elif os.path.exists(
 4697                        os.path.join(folder_config, param_exomiser_preset)
 4698                    ):
 4699                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
 4700                        param_exomiser_analysis_default_config_file = os.path.join(
 4701                            folder_config, param_exomiser_preset
 4702                        )
 4703                    else:
 4704                        # Construct preset file
 4705                        param_exomiser_analysis_default_config_file = os.path.join(
 4706                            folder_config,
 4707                            f"preset-{param_exomiser_preset}-analysis.json",
 4708                        )
 4709
 4710                    # If preset file exists
 4711                    param_exomiser_analysis_default_config_file = full_path(
 4712                        param_exomiser_analysis_default_config_file
 4713                    )
 4714                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4715                        # Load preset file into analysis dict (either yaml or json)
 4716                        with open(
 4717                            param_exomiser_analysis_default_config_file
 4718                        ) as json_file:
 4719                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4720                                json_file
 4721                            )
 4722
 4723                    # Error preset file
 4724                    else:
 4725                        log.error(
 4726                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4727                        )
 4728                        raise ValueError(
 4729                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4730                        )
 4731
 4732                # If no analysis dict created
 4733                if not param_exomiser_analysis_dict:
 4734                    log.error(f"No analysis config")
 4735                    raise ValueError(f"No analysis config")
 4736
 4737                # Log
 4738                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4739
 4740                ### PHENOPACKET ###
 4741                ###################
 4742
 4743                # If no PhenoPacket in analysis dict -> check in param
 4744                if "phenopacket" not in param_exomiser_analysis_dict:
 4745
 4746                    # If PhenoPacket in param -> load analysis json
 4747                    if param_exomiser.get("phenopacket", None):
 4748
 4749                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4750                        param_exomiser_phenopacket = full_path(
 4751                            param_exomiser_phenopacket
 4752                        )
 4753
 4754                        # If param phenopacket is a file and exists
 4755                        if isinstance(
 4756                            param_exomiser_phenopacket, str
 4757                        ) and os.path.exists(param_exomiser_phenopacket):
 4758                            # Load phenopacket file into analysis dict (either yaml or json)
 4759                            with open(param_exomiser_phenopacket) as json_file:
 4760                                param_exomiser_analysis_dict["phenopacket"] = (
 4761                                    yaml.safe_load(json_file)
 4762                                )
 4763
 4764                        # If param phenopacket is a dict
 4765                        elif isinstance(param_exomiser_phenopacket, dict):
 4766                            # Load phenopacket dict into analysis dict (either yaml or json)
 4767                            param_exomiser_analysis_dict["phenopacket"] = (
 4768                                param_exomiser_phenopacket
 4769                            )
 4770
 4771                        # Error phenopacket type
 4772                        else:
 4773                            log.error(f"Phenopacket type unknown. Check param file.")
 4774                            raise ValueError(
 4775                                f"Phenopacket type unknown. Check param file."
 4776                            )
 4777
 4778                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4779                if "phenopacket" not in param_exomiser_analysis_dict:
 4780
 4781                    # Init PhenoPacket
 4782                    param_exomiser_analysis_dict["phenopacket"] = {
 4783                        "id": "analysis",
 4784                        "proband": {},
 4785                    }
 4786
 4787                    ### Add subject ###
 4788
 4789                    # If subject exists
 4790                    param_exomiser_subject = param_exomiser.get("subject", {})
 4791
 4792                    # If subject not exists -> found sample ID
 4793                    if not param_exomiser_subject:
 4794
 4795                        # Found sample ID in param
 4796                        sample = param_exomiser.get("sample", None)
 4797
 4798                        # Find sample ID (first sample)
 4799                        if not sample:
 4800                            sample_list = self.get_header_sample_list()
 4801                            if len(sample_list) > 0:
 4802                                sample = sample_list[0]
 4803                            else:
 4804                                log.error(f"No sample found")
 4805                                raise ValueError(f"No sample found")
 4806
 4807                        # Create subject
 4808                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4809
 4810                    # Add to dict
 4811                    param_exomiser_analysis_dict["phenopacket"][
 4812                        "subject"
 4813                    ] = param_exomiser_subject
 4814
 4815                    ### Add "phenotypicFeatures" ###
 4816
 4817                    # If phenotypicFeatures exists
 4818                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4819                        "phenotypicFeatures", []
 4820                    )
 4821
 4822                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4823                    if not param_exomiser_phenotypicfeatures:
 4824
 4825                        # Found HPO in param
 4826                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4827
 4828                        # Split HPO if list in string format separated by comma
 4829                        if isinstance(param_exomiser_hpo, str):
 4830                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4831
 4832                        # Create HPO list
 4833                        for hpo in param_exomiser_hpo:
 4834                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4835                            param_exomiser_phenotypicfeatures.append(
 4836                                {
 4837                                    "type": {
 4838                                        "id": f"HP:{hpo_clean}",
 4839                                        "label": f"HP:{hpo_clean}",
 4840                                    }
 4841                                }
 4842                            )
 4843
 4844                    # Add to dict
 4845                    param_exomiser_analysis_dict["phenopacket"][
 4846                        "phenotypicFeatures"
 4847                    ] = param_exomiser_phenotypicfeatures
 4848
 4849                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4850                    if not param_exomiser_phenotypicfeatures:
 4851                        for step in param_exomiser_analysis_dict.get(
 4852                            "analysis", {}
 4853                        ).get("steps", []):
 4854                            if "hiPhivePrioritiser" in step:
 4855                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4856                                    "steps", []
 4857                                ).remove(step)
 4858
 4859                ### Add Input File ###
 4860
 4861                # Initial file name and htsFiles
 4862                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4863                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4864                    {
 4865                        "uri": tmp_vcf_name,
 4866                        "htsFormat": "VCF",
 4867                        "genomeAssembly": assembly,
 4868                    }
 4869                ]
 4870
 4871                ### Add metaData ###
 4872
 4873                # If metaData not in analysis dict
 4874                if "metaData" not in param_exomiser_analysis_dict:
 4875                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4876                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4877                        "createdBy": "howard",
 4878                        "phenopacketSchemaVersion": 1,
 4879                    }
 4880
 4881                ### OutputOptions ###
 4882
 4883                # Init output result folder
 4884                output_results = os.path.join(tmp_dir, "results")
 4885
 4886                # If no outputOptions in analysis dict
 4887                if "outputOptions" not in param_exomiser_analysis_dict:
 4888
 4889                    # default output formats
 4890                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4891
 4892                    # Get outputOptions in param
 4893                    output_options = param_exomiser.get("outputOptions", None)
 4894
 4895                    # If no output_options in param -> check
 4896                    if not output_options:
 4897                        output_options = {
 4898                            "outputContributingVariantsOnly": False,
 4899                            "numGenes": 0,
 4900                            "outputFormats": defaut_output_formats,
 4901                        }
 4902
 4903                    # Replace outputDirectory in output options
 4904                    output_options["outputDirectory"] = output_results
 4905                    output_options["outputFileName"] = "howard"
 4906
 4907                    # Add outputOptions in analysis dict
 4908                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4909
 4910                else:
 4911
 4912                    # Replace output_results and output format (if exists in param)
 4913                    param_exomiser_analysis_dict["outputOptions"][
 4914                        "outputDirectory"
 4915                    ] = output_results
 4916                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4917                        list(
 4918                            set(
 4919                                param_exomiser_analysis_dict.get(
 4920                                    "outputOptions", {}
 4921                                ).get("outputFormats", [])
 4922                                + ["TSV_VARIANT", "VCF"]
 4923                            )
 4924                        )
 4925                    )
 4926
 4927                # log
 4928                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4929
 4930                ### ANALYSIS FILE ###
 4931                #####################
 4932
 4933                ### Full JSON analysis config file ###
 4934
 4935                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4936                with open(exomiser_analysis, "w") as fp:
 4937                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4938
 4939                ### SPLIT analysis and sample config files
 4940
 4941                # Splitted analysis dict
 4942                param_exomiser_analysis_dict_for_split = (
 4943                    param_exomiser_analysis_dict.copy()
 4944                )
 4945
 4946                # Phenopacket JSON file
 4947                exomiser_analysis_phenopacket = os.path.join(
 4948                    tmp_dir, "analysis_phenopacket.json"
 4949                )
 4950                with open(exomiser_analysis_phenopacket, "w") as fp:
 4951                    json.dump(
 4952                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4953                        fp,
 4954                        indent=4,
 4955                    )
 4956
 4957                # Analysis JSON file without Phenopacket parameters
 4958                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4959                exomiser_analysis_analysis = os.path.join(
 4960                    tmp_dir, "analysis_analysis.json"
 4961                )
 4962                with open(exomiser_analysis_analysis, "w") as fp:
 4963                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4964
 4965                ### INITAL VCF file ###
 4966                #######################
 4967
 4968                ### Create list of samples to use and include inti initial VCF file ####
 4969
 4970                # Subject (main sample)
 4971                # Get sample ID in analysis dict
 4972                sample_subject = (
 4973                    param_exomiser_analysis_dict.get("phenopacket", {})
 4974                    .get("subject", {})
 4975                    .get("id", None)
 4976                )
 4977                sample_proband = (
 4978                    param_exomiser_analysis_dict.get("phenopacket", {})
 4979                    .get("proband", {})
 4980                    .get("subject", {})
 4981                    .get("id", None)
 4982                )
 4983                sample = []
 4984                if sample_subject:
 4985                    sample.append(sample_subject)
 4986                if sample_proband:
 4987                    sample.append(sample_proband)
 4988
 4989                # Get sample ID within Pedigree
 4990                pedigree_persons_list = (
 4991                    param_exomiser_analysis_dict.get("phenopacket", {})
 4992                    .get("pedigree", {})
 4993                    .get("persons", {})
 4994                )
 4995
 4996                # Create list with all sample ID in pedigree (if exists)
 4997                pedigree_persons = []
 4998                for person in pedigree_persons_list:
 4999                    pedigree_persons.append(person.get("individualId"))
 5000
 5001                # Concat subject sample ID and samples ID in pedigreesamples
 5002                samples = list(set(sample + pedigree_persons))
 5003
 5004                # Check if sample list is not empty
 5005                if not samples:
 5006                    log.error(f"No samples found")
 5007                    raise ValueError(f"No samples found")
 5008
 5009                # Create VCF with sample (either sample in param or first one by default)
 5010                # Export VCF file
 5011                self.export_variant_vcf(
 5012                    vcf_file=tmp_vcf_name,
 5013                    remove_info=True,
 5014                    add_samples=True,
 5015                    list_samples=samples,
 5016                    index=False,
 5017                )
 5018
 5019                ### Execute Exomiser ###
 5020                ########################
 5021
 5022                # Init command
 5023                exomiser_command = ""
 5024
 5025                # Command exomiser options
 5026                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5027
 5028                # Release
 5029                exomiser_release = param_exomiser.get("release", None)
 5030                if exomiser_release:
 5031                    # phenotype data version
 5032                    exomiser_options += (
 5033                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5034                    )
 5035                    # data version
 5036                    exomiser_options += (
 5037                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5038                    )
 5039                    # variant white list
 5040                    variant_white_list_file = (
 5041                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5042                    )
 5043                    if os.path.exists(
 5044                        os.path.join(
 5045                            databases_folders, assembly, variant_white_list_file
 5046                        )
 5047                    ):
 5048                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5049
 5050                # transcript_source
 5051                transcript_source = param_exomiser.get(
 5052                    "transcript_source", None
 5053                )  # ucsc, refseq, ensembl
 5054                if transcript_source:
 5055                    exomiser_options += (
 5056                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5057                    )
 5058
 5059                # If analysis contain proband param
 5060                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5061                    "proband", {}
 5062                ):
 5063                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5064
 5065                # If no proband (usually uniq sample)
 5066                else:
 5067                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5068
 5069                # Log
 5070                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5071
 5072                # Run command
 5073                result = subprocess.call(
 5074                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5075                )
 5076                if result:
 5077                    log.error("Exomiser command failed")
 5078                    raise ValueError("Exomiser command failed")
 5079
 5080                ### RESULTS ###
 5081                ###############
 5082
 5083                ### Annotate with TSV fields ###
 5084
 5085                # Init result tsv file
 5086                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5087
 5088                # Init result tsv file
 5089                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5090
 5091                # Parse TSV file and explode columns in INFO field
 5092                if exomiser_to_info and os.path.exists(output_results_tsv):
 5093
 5094                    # Log
 5095                    log.debug("Exomiser columns to VCF INFO field")
 5096
 5097                    # Retrieve columns and types
 5098                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5099                    output_results_tsv_df = self.get_query_to_df(query)
 5100                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5101
 5102                    # Init concat fields for update
 5103                    sql_query_update_concat_fields = []
 5104
 5105                    # Fields to avoid
 5106                    fields_to_avoid = [
 5107                        "CONTIG",
 5108                        "START",
 5109                        "END",
 5110                        "REF",
 5111                        "ALT",
 5112                        "QUAL",
 5113                        "FILTER",
 5114                        "GENOTYPE",
 5115                    ]
 5116
 5117                    # List all columns to add into header
 5118                    for header_column in output_results_tsv_columns:
 5119
 5120                        # If header column is enable
 5121                        if header_column not in fields_to_avoid:
 5122
 5123                            # Header info type
 5124                            header_info_type = "String"
 5125                            header_column_df = output_results_tsv_df[header_column]
 5126                            header_column_df_dtype = header_column_df.dtype
 5127                            if header_column_df_dtype == object:
 5128                                if (
 5129                                    pd.to_numeric(header_column_df, errors="coerce")
 5130                                    .notnull()
 5131                                    .all()
 5132                                ):
 5133                                    header_info_type = "Float"
 5134                            else:
 5135                                header_info_type = "Integer"
 5136
 5137                            # Header info
 5138                            characters_to_validate = ["-"]
 5139                            pattern = "[" + "".join(characters_to_validate) + "]"
 5140                            header_info_name = re.sub(
 5141                                pattern,
 5142                                "_",
 5143                                f"Exomiser_{header_column}".replace("#", ""),
 5144                            )
 5145                            header_info_number = "."
 5146                            header_info_description = (
 5147                                f"Exomiser {header_column} annotation"
 5148                            )
 5149                            header_info_source = "Exomiser"
 5150                            header_info_version = "unknown"
 5151                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5152                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5153                                header_info_name,
 5154                                header_info_number,
 5155                                header_info_type,
 5156                                header_info_description,
 5157                                header_info_source,
 5158                                header_info_version,
 5159                                header_info_code,
 5160                            )
 5161
 5162                            # Add field to add for update to concat fields
 5163                            sql_query_update_concat_fields.append(
 5164                                f"""
 5165                                CASE
 5166                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5167                                    THEN concat(
 5168                                        '{header_info_name}=',
 5169                                        table_parquet."{header_column}",
 5170                                        ';'
 5171                                        )
 5172
 5173                                    ELSE ''
 5174                                END
 5175                            """
 5176                            )
 5177
 5178                    # Update query
 5179                    sql_query_update = f"""
 5180                        UPDATE {table_variants} as table_variants
 5181                            SET INFO = concat(
 5182                                            CASE
 5183                                                WHEN INFO NOT IN ('', '.')
 5184                                                THEN INFO
 5185                                                ELSE ''
 5186                                            END,
 5187                                            CASE
 5188                                                WHEN table_variants.INFO NOT IN ('','.')
 5189                                                THEN ';'
 5190                                                ELSE ''
 5191                                            END,
 5192                                            (
 5193                                            SELECT 
 5194                                                concat(
 5195                                                    {",".join(sql_query_update_concat_fields)}
 5196                                                )
 5197                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5198                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5199                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5200                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5201                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5202                                            )
 5203                                        )
 5204                            ;
 5205                        """
 5206
 5207                    # Update
 5208                    self.conn.execute(sql_query_update)
 5209
 5210                ### Annotate with VCF INFO field ###
 5211
 5212                # Init result VCF file
 5213                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5214
 5215                # If VCF exists
 5216                if os.path.exists(output_results_vcf):
 5217
 5218                    # Log
 5219                    log.debug("Exomiser result VCF update variants")
 5220
 5221                    # Find Exomiser INFO field annotation in header
 5222                    with gzip.open(output_results_vcf, "rt") as f:
 5223                        header_list = self.read_vcf_header(f)
 5224                    exomiser_vcf_header = vcf.Reader(
 5225                        io.StringIO("\n".join(header_list))
 5226                    )
 5227
 5228                    # Add annotation INFO field to header
 5229                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5230
 5231                    # Update variants with VCF
 5232                    self.update_from_vcf(output_results_vcf)
 5233
 5234        return True
 5235
 5236    def annotation_snpeff(self, threads: int = None) -> None:
 5237        """
 5238        This function annotate with snpEff
 5239
 5240        :param threads: The number of threads to use
 5241        :return: the value of the variable "return_value".
 5242        """
 5243
 5244        # DEBUG
 5245        log.debug("Start annotation with snpeff databases")
 5246
 5247        # Threads
 5248        if not threads:
 5249            threads = self.get_threads()
 5250        log.debug("Threads: " + str(threads))
 5251
 5252        # DEBUG
 5253        delete_tmp = True
 5254        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5255            delete_tmp = False
 5256            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5257
 5258        # Config
 5259        config = self.get_config()
 5260        log.debug("Config: " + str(config))
 5261
 5262        # Config - Folders - Databases
 5263        databases_folders = (
 5264            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5265        )
 5266        log.debug("Databases annotations: " + str(databases_folders))
 5267
 5268        # Config - snpEff bin command
 5269        snpeff_bin_command = get_bin_command(
 5270            bin="snpEff.jar",
 5271            tool="snpeff",
 5272            bin_type="jar",
 5273            config=config,
 5274            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5275        )
 5276        if not snpeff_bin_command:
 5277            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5278            log.error(msg_err)
 5279            raise ValueError(msg_err)
 5280
 5281        # Config - snpEff databases
 5282        snpeff_databases = (
 5283            config.get("folders", {})
 5284            .get("databases", {})
 5285            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5286        )
 5287        snpeff_databases = full_path(snpeff_databases)
 5288        if snpeff_databases is not None and snpeff_databases != "":
 5289            log.debug(f"Create snpEff databases folder")
 5290            if not os.path.exists(snpeff_databases):
 5291                os.makedirs(snpeff_databases)
 5292
 5293        # Param
 5294        param = self.get_param()
 5295        log.debug("Param: " + str(param))
 5296
 5297        # Param
 5298        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5299        log.debug("Options: " + str(options))
 5300
 5301        # Param - Assembly
 5302        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5303
 5304        # Param - Options
 5305        snpeff_options = (
 5306            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5307        )
 5308        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5309        snpeff_csvstats = (
 5310            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5311        )
 5312        if snpeff_stats:
 5313            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5314            snpeff_stats = full_path(snpeff_stats)
 5315            snpeff_options += f" -stats {snpeff_stats}"
 5316        if snpeff_csvstats:
 5317            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5318            snpeff_csvstats = full_path(snpeff_csvstats)
 5319            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5320
 5321        # Data
 5322        table_variants = self.get_table_variants()
 5323
 5324        # Check if not empty
 5325        log.debug("Check if not empty")
 5326        sql_query_chromosomes = (
 5327            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5328        )
 5329        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5330        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5331            log.info(f"VCF empty")
 5332            return
 5333
 5334        # Export in VCF
 5335        log.debug("Create initial file to annotate")
 5336        tmp_vcf = NamedTemporaryFile(
 5337            prefix=self.get_prefix(),
 5338            dir=self.get_tmp_dir(),
 5339            suffix=".vcf.gz",
 5340            delete=True,
 5341        )
 5342        tmp_vcf_name = tmp_vcf.name
 5343
 5344        # VCF header
 5345        vcf_reader = self.get_header()
 5346        log.debug("Initial header: " + str(vcf_reader.infos))
 5347
 5348        # Existing annotations
 5349        for vcf_annotation in self.get_header().infos:
 5350
 5351            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5352            log.debug(
 5353                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5354            )
 5355
 5356        # Memory limit
 5357        # if config.get("memory", None):
 5358        #     memory_limit = config.get("memory", "8G")
 5359        # else:
 5360        #     memory_limit = "8G"
 5361        memory_limit = self.get_memory("8G")
 5362        log.debug(f"memory_limit: {memory_limit}")
 5363
 5364        # snpEff java options
 5365        snpeff_java_options = (
 5366            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5367        )
 5368        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5369
 5370        force_update_annotation = True
 5371
 5372        if "ANN" not in self.get_header().infos or force_update_annotation:
 5373
 5374            # Check snpEff database
 5375            log.debug(f"Check snpEff databases {[assembly]}")
 5376            databases_download_snpeff(
 5377                folder=snpeff_databases, assemblies=[assembly], config=config
 5378            )
 5379
 5380            # Export VCF file
 5381            self.export_variant_vcf(
 5382                vcf_file=tmp_vcf_name,
 5383                remove_info=True,
 5384                add_samples=False,
 5385                index=True,
 5386            )
 5387
 5388            # Tmp file
 5389            err_files = []
 5390            tmp_annotate_vcf = NamedTemporaryFile(
 5391                prefix=self.get_prefix(),
 5392                dir=self.get_tmp_dir(),
 5393                suffix=".vcf",
 5394                delete=False,
 5395            )
 5396            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5397            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5398            err_files.append(tmp_annotate_vcf_name_err)
 5399
 5400            # Command
 5401            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5402            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5403            run_parallel_commands([snpeff_command], 1)
 5404
 5405            # Error messages
 5406            log.info(f"Error/Warning messages:")
 5407            error_message_command_all = []
 5408            error_message_command_warning = []
 5409            error_message_command_err = []
 5410            for err_file in err_files:
 5411                with open(err_file, "r") as f:
 5412                    for line in f:
 5413                        message = line.strip()
 5414                        error_message_command_all.append(message)
 5415                        if line.startswith("[W::"):
 5416                            error_message_command_warning.append(message)
 5417                        if line.startswith("[E::"):
 5418                            error_message_command_err.append(f"{err_file}: " + message)
 5419            # log info
 5420            for message in list(
 5421                set(error_message_command_err + error_message_command_warning)
 5422            ):
 5423                log.info(f"   {message}")
 5424            # debug info
 5425            for message in list(set(error_message_command_all)):
 5426                log.debug(f"   {message}")
 5427            # failed
 5428            if len(error_message_command_err):
 5429                log.error("Annotation failed: Error in commands")
 5430                raise ValueError("Annotation failed: Error in commands")
 5431
 5432            # Find annotation in header
 5433            with open(tmp_annotate_vcf_name, "rt") as f:
 5434                header_list = self.read_vcf_header(f)
 5435            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5436
 5437            for ann in annovar_vcf_header.infos:
 5438                if ann not in self.get_header().infos:
 5439                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5440
 5441            # Update variants
 5442            log.info(f"Annotation - Updating...")
 5443            self.update_from_vcf(tmp_annotate_vcf_name)
 5444
 5445        else:
 5446            if "ANN" in self.get_header().infos:
 5447                log.debug(f"Existing snpEff annotations in VCF")
 5448            if force_update_annotation:
 5449                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5450
 5451    def annotation_annovar(self, threads: int = None) -> None:
 5452        """
 5453        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5454        annotations
 5455
 5456        :param threads: number of threads to use
 5457        :return: the value of the variable "return_value".
 5458        """
 5459
 5460        # DEBUG
 5461        log.debug("Start annotation with Annovar databases")
 5462
 5463        # Threads
 5464        if not threads:
 5465            threads = self.get_threads()
 5466        log.debug("Threads: " + str(threads))
 5467
 5468        # Tmp en Err files
 5469        tmp_files = []
 5470        err_files = []
 5471
 5472        # DEBUG
 5473        delete_tmp = True
 5474        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5475            delete_tmp = False
 5476            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5477
 5478        # Config
 5479        config = self.get_config()
 5480        log.debug("Config: " + str(config))
 5481
 5482        # Config - Folders - Databases
 5483        databases_folders = (
 5484            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5485        )
 5486        log.debug("Databases annotations: " + str(databases_folders))
 5487
 5488        # Config - annovar bin command
 5489        annovar_bin_command = get_bin_command(
 5490            bin="table_annovar.pl",
 5491            tool="annovar",
 5492            bin_type="perl",
 5493            config=config,
 5494            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5495        )
 5496        if not annovar_bin_command:
 5497            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5498            log.error(msg_err)
 5499            raise ValueError(msg_err)
 5500
 5501        # Config - BCFTools bin command
 5502        bcftools_bin_command = get_bin_command(
 5503            bin="bcftools",
 5504            tool="bcftools",
 5505            bin_type="bin",
 5506            config=config,
 5507            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5508        )
 5509        if not bcftools_bin_command:
 5510            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5511            log.error(msg_err)
 5512            raise ValueError(msg_err)
 5513
 5514        # Config - annovar databases
 5515        annovar_databases = (
 5516            config.get("folders", {})
 5517            .get("databases", {})
 5518            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5519        )
 5520        if annovar_databases is not None:
 5521            if isinstance(annovar_databases, list):
 5522                annovar_databases = full_path(annovar_databases[0])
 5523                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
 5524            annovar_databases = full_path(annovar_databases)
 5525            if not os.path.exists(annovar_databases):
 5526                log.info(f"Annovar databases folder '{annovar_databases}' created")
 5527                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
 5528        else:
 5529            msg_err = f"Annovar databases configuration failed"
 5530            log.error(msg_err)
 5531            raise ValueError(msg_err)
 5532
 5533        # Param
 5534        param = self.get_param()
 5535        log.debug("Param: " + str(param))
 5536
 5537        # Param - options
 5538        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5539        log.debug("Options: " + str(options))
 5540
 5541        # Param - annotations
 5542        annotations = (
 5543            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5544        )
 5545        log.debug("Annotations: " + str(annotations))
 5546
 5547        # Param - Assembly
 5548        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5549
 5550        # Annovar database assembly
 5551        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5552        if annovar_databases_assembly != "" and not os.path.exists(
 5553            annovar_databases_assembly
 5554        ):
 5555            os.makedirs(annovar_databases_assembly)
 5556
 5557        # Data
 5558        table_variants = self.get_table_variants()
 5559
 5560        # Check if not empty
 5561        log.debug("Check if not empty")
 5562        sql_query_chromosomes = (
 5563            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5564        )
 5565        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5566        if not sql_query_chromosomes_df["count"][0]:
 5567            log.info(f"VCF empty")
 5568            return
 5569
 5570        # VCF header
 5571        vcf_reader = self.get_header()
 5572        log.debug("Initial header: " + str(vcf_reader.infos))
 5573
 5574        # Existing annotations
 5575        for vcf_annotation in self.get_header().infos:
 5576
 5577            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5578            log.debug(
 5579                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5580            )
 5581
 5582        force_update_annotation = True
 5583
 5584        if annotations:
 5585
 5586            commands = []
 5587            tmp_annotates_vcf_name_list = []
 5588
 5589            # Export in VCF
 5590            log.debug("Create initial file to annotate")
 5591            tmp_vcf = NamedTemporaryFile(
 5592                prefix=self.get_prefix(),
 5593                dir=self.get_tmp_dir(),
 5594                suffix=".vcf.gz",
 5595                delete=False,
 5596            )
 5597            tmp_vcf_name = tmp_vcf.name
 5598            tmp_files.append(tmp_vcf_name)
 5599            tmp_files.append(tmp_vcf_name + ".tbi")
 5600
 5601            # Export VCF file
 5602            self.export_variant_vcf(
 5603                vcf_file=tmp_vcf_name,
 5604                remove_info=".",
 5605                add_samples=False,
 5606                index=True,
 5607            )
 5608
 5609            # Create file for field rename
 5610            log.debug("Create file for field rename")
 5611            tmp_rename = NamedTemporaryFile(
 5612                prefix=self.get_prefix(),
 5613                dir=self.get_tmp_dir(),
 5614                suffix=".rename",
 5615                delete=False,
 5616            )
 5617            tmp_rename_name = tmp_rename.name
 5618            tmp_files.append(tmp_rename_name)
 5619
 5620            # Check Annovar database
 5621            log.debug(
 5622                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5623            )
 5624            databases_download_annovar(
 5625                folder=annovar_databases,
 5626                files=list(annotations.keys()),
 5627                assemblies=[assembly],
 5628            )
 5629
 5630            for annotation in annotations:
 5631                annotation_fields = annotations[annotation]
 5632
 5633                if not annotation_fields:
 5634                    annotation_fields = {"INFO": None}
 5635
 5636                log.info(f"Annotations Annovar - database '{annotation}'")
 5637                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5638
 5639                # Tmp file for annovar
 5640                err_files = []
 5641                tmp_annotate_vcf_directory = TemporaryDirectory(
 5642                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5643                )
 5644                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5645                tmp_annotate_vcf_name_annovar = (
 5646                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5647                )
 5648                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5649                err_files.append(tmp_annotate_vcf_name_err)
 5650                tmp_files.append(tmp_annotate_vcf_name_err)
 5651
 5652                # Tmp file final vcf annotated by annovar
 5653                tmp_annotate_vcf = NamedTemporaryFile(
 5654                    prefix=self.get_prefix(),
 5655                    dir=self.get_tmp_dir(),
 5656                    suffix=".vcf.gz",
 5657                    delete=False,
 5658                )
 5659                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5660                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5661                tmp_files.append(tmp_annotate_vcf_name)
 5662                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5663
 5664                # Number of fields
 5665                annotation_list = []
 5666                annotation_renamed_list = []
 5667
 5668                for annotation_field in annotation_fields:
 5669
 5670                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5671                    annotation_fields_new_name = annotation_fields.get(
 5672                        annotation_field, annotation_field
 5673                    )
 5674                    if not annotation_fields_new_name:
 5675                        annotation_fields_new_name = annotation_field
 5676
 5677                    if (
 5678                        force_update_annotation
 5679                        or annotation_fields_new_name not in self.get_header().infos
 5680                    ):
 5681                        annotation_list.append(annotation_field)
 5682                        annotation_renamed_list.append(annotation_fields_new_name)
 5683                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5684                        log.warning(
 5685                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5686                        )
 5687
 5688                    # Add rename info
 5689                    run_parallel_commands(
 5690                        [
 5691                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5692                        ],
 5693                        1,
 5694                    )
 5695
 5696                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5697                log.debug("annotation_list: " + str(annotation_list))
 5698
 5699                # protocol
 5700                protocol = annotation
 5701
 5702                # argument
 5703                argument = ""
 5704
 5705                # operation
 5706                operation = "f"
 5707                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5708                    "ensGene"
 5709                ):
 5710                    operation = "g"
 5711                    if options.get("genebase", None):
 5712                        argument = f"""'{options.get("genebase","")}'"""
 5713                elif annotation in ["cytoBand"]:
 5714                    operation = "r"
 5715
 5716                # argument option
 5717                argument_option = ""
 5718                if argument != "":
 5719                    argument_option = " --argument " + argument
 5720
 5721                # command options
 5722                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5723                for option in options:
 5724                    if option not in ["genebase"]:
 5725                        command_options += f""" --{option}={options[option]}"""
 5726
 5727                # Command
 5728
 5729                # Command - Annovar
 5730                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5731                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5732
 5733                # Command - start pipe
 5734                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5735
 5736                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5737                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5738
 5739                # Command - Special characters (refGene annotation)
 5740                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5741
 5742                # Command - Clean empty fields (with value ".")
 5743                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5744
 5745                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5746                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5747                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5748                    # for ann in annotation_renamed_list:
 5749                    for ann in annotation_list:
 5750                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5751
 5752                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5753
 5754                # Command - indexing
 5755                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5756
 5757                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5758                run_parallel_commands([command_annovar], 1)
 5759
 5760                # Error messages
 5761                log.info(f"Error/Warning messages:")
 5762                error_message_command_all = []
 5763                error_message_command_warning = []
 5764                error_message_command_err = []
 5765                for err_file in err_files:
 5766                    with open(err_file, "r") as f:
 5767                        for line in f:
 5768                            message = line.strip()
 5769                            error_message_command_all.append(message)
 5770                            if line.startswith("[W::") or line.startswith("WARNING"):
 5771                                error_message_command_warning.append(message)
 5772                            if line.startswith("[E::") or line.startswith("ERROR"):
 5773                                error_message_command_err.append(
 5774                                    f"{err_file}: " + message
 5775                                )
 5776                # log info
 5777                for message in list(
 5778                    set(error_message_command_err + error_message_command_warning)
 5779                ):
 5780                    log.info(f"   {message}")
 5781                # debug info
 5782                for message in list(set(error_message_command_all)):
 5783                    log.debug(f"   {message}")
 5784                # failed
 5785                if len(error_message_command_err):
 5786                    log.error("Annotation failed: Error in commands")
 5787                    raise ValueError("Annotation failed: Error in commands")
 5788
 5789            if tmp_annotates_vcf_name_list:
 5790
 5791                # List of annotated files
 5792                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5793
 5794                # Tmp file
 5795                tmp_annotate_vcf = NamedTemporaryFile(
 5796                    prefix=self.get_prefix(),
 5797                    dir=self.get_tmp_dir(),
 5798                    suffix=".vcf.gz",
 5799                    delete=False,
 5800                )
 5801                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5802                tmp_files.append(tmp_annotate_vcf_name)
 5803                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5804                err_files.append(tmp_annotate_vcf_name_err)
 5805                tmp_files.append(tmp_annotate_vcf_name_err)
 5806
 5807                # Command merge
 5808                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5809                log.info(
 5810                    f"Annotation Annovar - Annotation merging "
 5811                    + str(len(tmp_annotates_vcf_name_list))
 5812                    + " annotated files"
 5813                )
 5814                log.debug(f"Annotation - merge command: {merge_command}")
 5815                run_parallel_commands([merge_command], 1)
 5816
 5817                # Find annotation in header
 5818                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5819                    header_list = self.read_vcf_header(f)
 5820                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5821
 5822                for ann in annovar_vcf_header.infos:
 5823                    if ann not in self.get_header().infos:
 5824                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5825
 5826                # Update variants
 5827                log.info(f"Annotation Annovar - Updating...")
 5828                self.update_from_vcf(tmp_annotate_vcf_name)
 5829
 5830            # Clean files
 5831            # Tmp file remove command
 5832            if True:
 5833                tmp_files_remove_command = ""
 5834                if tmp_files:
 5835                    tmp_files_remove_command = " ".join(tmp_files)
 5836                clean_command = f" rm -f {tmp_files_remove_command} "
 5837                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5838                log.debug(f"Annotation - cleaning command: {clean_command}")
 5839                run_parallel_commands([clean_command], 1)
 5840
    # Annotation using Parquet databases
 5842    def annotation_parquet(self, threads: int = None) -> None:
 5843        """
 5844        It takes a VCF file, and annotates it with a parquet file
 5845
 5846        :param threads: number of threads to use for the annotation
 5847        :return: the value of the variable "result".
 5848        """
 5849
 5850        # DEBUG
 5851        log.debug("Start annotation with parquet databases")
 5852
 5853        # Threads
 5854        if not threads:
 5855            threads = self.get_threads()
 5856        log.debug("Threads: " + str(threads))
 5857
 5858        # DEBUG
 5859        delete_tmp = True
 5860        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5861            delete_tmp = False
 5862            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5863
 5864        # Config
 5865        databases_folders = set(
 5866            self.get_config()
 5867            .get("folders", {})
 5868            .get("databases", {})
 5869            .get("annotations", ["."])
 5870            + self.get_config()
 5871            .get("folders", {})
 5872            .get("databases", {})
 5873            .get("parquet", ["."])
 5874        )
 5875        log.debug("Databases annotations: " + str(databases_folders))
 5876
 5877        # Param
 5878        annotations = (
 5879            self.get_param()
 5880            .get("annotation", {})
 5881            .get("parquet", {})
 5882            .get("annotations", None)
 5883        )
 5884        log.debug("Annotations: " + str(annotations))
 5885
 5886        # Assembly
 5887        assembly = self.get_param().get(
 5888            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5889        )
 5890
 5891        # Force Update Annotation
 5892        force_update_annotation = (
 5893            self.get_param()
 5894            .get("annotation", {})
 5895            .get("options", {})
 5896            .get("annotations_update", False)
 5897        )
 5898        log.debug(f"force_update_annotation={force_update_annotation}")
 5899        force_append_annotation = (
 5900            self.get_param()
 5901            .get("annotation", {})
 5902            .get("options", {})
 5903            .get("annotations_append", False)
 5904        )
 5905        log.debug(f"force_append_annotation={force_append_annotation}")
 5906
 5907        # Data
 5908        table_variants = self.get_table_variants()
 5909
 5910        # Check if not empty
 5911        log.debug("Check if not empty")
 5912        sql_query_chromosomes_df = self.get_query_to_df(
 5913            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5914        )
 5915        if not sql_query_chromosomes_df["count"][0]:
 5916            log.info(f"VCF empty")
 5917            return
 5918
 5919        # VCF header
 5920        vcf_reader = self.get_header()
 5921        log.debug("Initial header: " + str(vcf_reader.infos))
 5922
 5923        # Nb Variants POS
 5924        log.debug("NB Variants Start")
 5925        nb_variants = self.conn.execute(
 5926            f"SELECT count(*) AS count FROM variants"
 5927        ).fetchdf()["count"][0]
 5928        log.debug("NB Variants Stop")
 5929
 5930        # Existing annotations
 5931        for vcf_annotation in self.get_header().infos:
 5932
 5933            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5934            log.debug(
 5935                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5936            )
 5937
 5938        # Added columns
 5939        added_columns = []
 5940
 5941        # drop indexes
 5942        log.debug(f"Drop indexes...")
 5943        self.drop_indexes()
 5944
 5945        if annotations:
 5946
 5947            if "ALL" in annotations:
 5948
 5949                all_param = annotations.get("ALL", {})
 5950                all_param_formats = all_param.get("formats", None)
 5951                all_param_releases = all_param.get("releases", None)
 5952
 5953                databases_infos_dict = self.scan_databases(
 5954                    database_formats=all_param_formats,
 5955                    database_releases=all_param_releases,
 5956                )
 5957                for database_infos in databases_infos_dict.keys():
 5958                    if database_infos not in annotations:
 5959                        annotations[database_infos] = {"INFO": None}
 5960
 5961            for annotation in annotations:
 5962
 5963                if annotation in ["ALL"]:
 5964                    continue
 5965
 5966                # Annotation Name
 5967                annotation_name = os.path.basename(annotation)
 5968
 5969                # Annotation fields
 5970                annotation_fields = annotations[annotation]
 5971                if not annotation_fields:
 5972                    annotation_fields = {"INFO": None}
 5973
 5974                log.debug(f"Annotation '{annotation_name}'")
 5975                log.debug(
 5976                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5977                )
 5978
 5979                # Create Database
 5980                database = Database(
 5981                    database=annotation,
 5982                    databases_folders=databases_folders,
 5983                    assembly=assembly,
 5984                )
 5985
 5986                # Find files
 5987                parquet_file = database.get_database()
 5988                parquet_hdr_file = database.get_header_file()
 5989                parquet_type = database.get_type()
 5990
 5991                # Check if files exists
 5992                if not parquet_file or not parquet_hdr_file:
 5993                    msg_err_list = []
 5994                    if not parquet_file:
 5995                        msg_err_list.append(
 5996                            f"Annotation failed: Annotation file not found"
 5997                        )
 5998                    if parquet_file and not parquet_hdr_file:
 5999                        msg_err_list.append(
 6000                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 6001                        )
 6002
 6003                    log.error(". ".join(msg_err_list))
 6004                    raise ValueError(". ".join(msg_err_list))
 6005                else:
 6006                    # Get parquet connexion
 6007                    parquet_sql_attach = database.get_sql_database_attach(
 6008                        output="query"
 6009                    )
 6010                    if parquet_sql_attach:
 6011                        self.conn.execute(parquet_sql_attach)
 6012                    parquet_file_link = database.get_sql_database_link()
 6013                    # Log
 6014                    log.debug(
 6015                        f"Annotation '{annotation_name}' - file: "
 6016                        + str(parquet_file)
 6017                        + " and "
 6018                        + str(parquet_hdr_file)
 6019                    )
 6020
 6021                    # Database full header columns
 6022                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 6023                        parquet_hdr_file
 6024                    )
 6025                    # Log
 6026                    log.debug(
 6027                        "Annotation database header columns : "
 6028                        + str(parquet_hdr_vcf_header_columns)
 6029                    )
 6030
 6031                    # Load header as VCF object
 6032                    parquet_hdr_vcf_header_infos = database.get_header().infos
 6033                    # Log
 6034                    log.debug(
 6035                        "Annotation database header: "
 6036                        + str(parquet_hdr_vcf_header_infos)
 6037                    )
 6038
 6039                    # Get extra infos
 6040                    parquet_columns = database.get_extra_columns()
 6041                    # Log
 6042                    log.debug("Annotation database Columns: " + str(parquet_columns))
 6043
 6044                    # Add extra columns if "ALL" in annotation_fields
 6045                    # if "ALL" in annotation_fields:
 6046                    #     allow_add_extra_column = True
 6047                    if "ALL" in annotation_fields and database.get_extra_columns():
 6048                        for extra_column in database.get_extra_columns():
 6049                            if (
 6050                                extra_column not in annotation_fields
 6051                                and extra_column.replace("INFO/", "")
 6052                                not in parquet_hdr_vcf_header_infos
 6053                            ):
 6054                                parquet_hdr_vcf_header_infos[extra_column] = (
 6055                                    vcf.parser._Info(
 6056                                        extra_column,
 6057                                        ".",
 6058                                        "String",
 6059                                        f"{extra_column} description",
 6060                                        "unknown",
 6061                                        "unknown",
 6062                                        self.code_type_map["String"],
 6063                                    )
 6064                                )
 6065
 6066                    # For all fields in database
 6067                    annotation_fields_all = False
 6068                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6069                        annotation_fields_all = True
 6070                        annotation_fields = {
 6071                            key: key for key in parquet_hdr_vcf_header_infos
 6072                        }
 6073
 6074                        log.debug(
 6075                            "Annotation database header - All annotations added: "
 6076                            + str(annotation_fields)
 6077                        )
 6078
 6079                    # Init
 6080
 6081                    # List of annotation fields to use
 6082                    sql_query_annotation_update_info_sets = []
 6083
 6084                    # List of annotation to agregate
 6085                    sql_query_annotation_to_agregate = []
 6086
 6087                    # Number of fields
 6088                    nb_annotation_field = 0
 6089
 6090                    # Annotation fields processed
 6091                    annotation_fields_processed = []
 6092
 6093                    # Columns mapping
 6094                    map_columns = database.map_columns(
 6095                        columns=annotation_fields, prefixes=["INFO/"]
 6096                    )
 6097
 6098                    # Query dict for fields to remove (update option)
 6099                    query_dict_remove = {}
 6100
 6101                    # Fetch Anotation fields
 6102                    for annotation_field in annotation_fields:
 6103
 6104                        # annotation_field_column
 6105                        annotation_field_column = map_columns.get(
 6106                            annotation_field, "INFO"
 6107                        )
 6108
 6109                        # field new name, if parametered
 6110                        annotation_fields_new_name = annotation_fields.get(
 6111                            annotation_field, annotation_field
 6112                        )
 6113                        if not annotation_fields_new_name:
 6114                            annotation_fields_new_name = annotation_field
 6115
 6116                        # To annotate
 6117                        # force_update_annotation = True
 6118                        # force_append_annotation = True
 6119                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6120                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6121                            force_update_annotation
 6122                            or force_append_annotation
 6123                            or (
 6124                                annotation_fields_new_name
 6125                                not in self.get_header().infos
 6126                            )
 6127                        ):
 6128
 6129                            # Add field to annotation to process list
 6130                            annotation_fields_processed.append(
 6131                                annotation_fields_new_name
 6132                            )
 6133
 6134                            # explode infos for the field
 6135                            annotation_fields_new_name_info_msg = ""
 6136                            if (
 6137                                force_update_annotation
 6138                                and annotation_fields_new_name
 6139                                in self.get_header().infos
 6140                            ):
 6141                                # Remove field from INFO
 6142                                query = f"""
 6143                                    UPDATE {table_variants} as table_variants
 6144                                    SET INFO = REGEXP_REPLACE(
 6145                                                concat(table_variants.INFO,''),
 6146                                                ';*{annotation_fields_new_name}=[^;]*',
 6147                                                ''
 6148                                                )
 6149                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6150                                """
 6151                                annotation_fields_new_name_info_msg = " [update]"
 6152                                query_dict_remove[
 6153                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6154                                ] = query
 6155
 6156                            # Sep between fields in INFO
 6157                            nb_annotation_field += 1
 6158                            if nb_annotation_field > 1:
 6159                                annotation_field_sep = ";"
 6160                            else:
 6161                                annotation_field_sep = ""
 6162
 6163                            log.info(
 6164                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6165                            )
 6166
 6167                            # Add INFO field to header
 6168                            parquet_hdr_vcf_header_infos_number = (
 6169                                parquet_hdr_vcf_header_infos[annotation_field].num
 6170                                or "."
 6171                            )
 6172                            parquet_hdr_vcf_header_infos_type = (
 6173                                parquet_hdr_vcf_header_infos[annotation_field].type
 6174                                or "String"
 6175                            )
 6176                            parquet_hdr_vcf_header_infos_description = (
 6177                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6178                                or f"{annotation_field} description"
 6179                            )
 6180                            parquet_hdr_vcf_header_infos_source = (
 6181                                parquet_hdr_vcf_header_infos[annotation_field].source
 6182                                or "unknown"
 6183                            )
 6184                            parquet_hdr_vcf_header_infos_version = (
 6185                                parquet_hdr_vcf_header_infos[annotation_field].version
 6186                                or "unknown"
 6187                            )
 6188
 6189                            vcf_reader.infos[annotation_fields_new_name] = (
 6190                                vcf.parser._Info(
 6191                                    annotation_fields_new_name,
 6192                                    parquet_hdr_vcf_header_infos_number,
 6193                                    parquet_hdr_vcf_header_infos_type,
 6194                                    parquet_hdr_vcf_header_infos_description,
 6195                                    parquet_hdr_vcf_header_infos_source,
 6196                                    parquet_hdr_vcf_header_infos_version,
 6197                                    self.code_type_map[
 6198                                        parquet_hdr_vcf_header_infos_type
 6199                                    ],
 6200                                )
 6201                            )
 6202
 6203                            # Append
 6204                            if force_append_annotation:
 6205                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6206                            else:
 6207                                query_case_when_append = ""
 6208
 6209                            # Annotation/Update query fields
 6210                            # Found in INFO column
 6211                            if (
 6212                                annotation_field_column == "INFO"
 6213                                and "INFO" in parquet_hdr_vcf_header_columns
 6214                            ):
 6215                                sql_query_annotation_update_info_sets.append(
 6216                                    f"""
 6217                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6218                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6219                                        ELSE ''
 6220                                    END
 6221                                """
 6222                                )
 6223                            # Found in a specific column
 6224                            else:
 6225                                sql_query_annotation_update_info_sets.append(
 6226                                    f"""
 6227                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6228                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6229                                        ELSE ''
 6230                                    END
 6231                                """
 6232                                )
 6233                                sql_query_annotation_to_agregate.append(
 6234                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6235                                )
 6236
 6237                        # Not to annotate
 6238                        else:
 6239
 6240                            if force_update_annotation:
 6241                                annotation_message = "forced"
 6242                            else:
 6243                                annotation_message = "skipped"
 6244
 6245                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6246                                log.warning(
 6247                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6248                                )
 6249                            if annotation_fields_new_name in self.get_header().infos:
 6250                                log.warning(
 6251                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6252                                )
 6253
 6254                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6255                    # allow_annotation_full_info = True
 6256                    allow_annotation_full_info = not force_append_annotation
 6257
 6258                    if parquet_type in ["regions"]:
 6259                        allow_annotation_full_info = False
 6260
 6261                    if (
 6262                        allow_annotation_full_info
 6263                        and nb_annotation_field == len(annotation_fields)
 6264                        and annotation_fields_all
 6265                        and (
 6266                            "INFO" in parquet_hdr_vcf_header_columns
 6267                            and "INFO" in database.get_extra_columns()
 6268                        )
 6269                    ):
 6270                        log.debug("Column INFO annotation enabled")
 6271                        sql_query_annotation_update_info_sets = []
 6272                        sql_query_annotation_update_info_sets.append(
 6273                            f" table_parquet.INFO "
 6274                        )
 6275
 6276                    if sql_query_annotation_update_info_sets:
 6277
 6278                        # Annotate
 6279                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6280
 6281                        # Join query annotation update info sets for SQL
 6282                        sql_query_annotation_update_info_sets_sql = ",".join(
 6283                            sql_query_annotation_update_info_sets
 6284                        )
 6285
 6286                        # Check chromosomes list (and variants infos)
 6287                        sql_query_chromosomes = f"""
 6288                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6289                            FROM {table_variants} as table_variants
 6290                            GROUP BY table_variants."#CHROM"
 6291                            ORDER BY table_variants."#CHROM"
 6292                            """
 6293                        sql_query_chromosomes_df = self.conn.execute(
 6294                            sql_query_chromosomes
 6295                        ).df()
 6296                        sql_query_chromosomes_dict = {
 6297                            entry["CHROM"]: {
 6298                                "count": entry["count_variants"],
 6299                                "min": entry["min_variants"],
 6300                                "max": entry["max_variants"],
 6301                            }
 6302                            for index, entry in sql_query_chromosomes_df.iterrows()
 6303                        }
 6304
 6305                        # Init
 6306                        nb_of_query = 0
 6307                        nb_of_variant_annotated = 0
 6308                        query_dict = query_dict_remove
 6309
 6310                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6311                        for chrom in sql_query_chromosomes_dict:
 6312
 6313                            # Number of variant by chromosome
 6314                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6315                                chrom, {}
 6316                            ).get("count", 0)
 6317
 6318                            log.debug(
 6319                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6320                            )
 6321
 6322                            # Annotation with regions database
 6323                            if parquet_type in ["regions"]:
 6324                                sql_query_annotation_from_clause = f"""
 6325                                    FROM (
 6326                                        SELECT 
 6327                                            '{chrom}' AS \"#CHROM\",
 6328                                            table_variants_from.\"POS\" AS \"POS\",
 6329                                            {",".join(sql_query_annotation_to_agregate)}
 6330                                        FROM {table_variants} as table_variants_from
 6331                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6332                                            table_parquet_from."#CHROM" = '{chrom}'
 6333                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6334                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6335                                        )
 6336                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6337                                        GROUP BY table_variants_from.\"POS\"
 6338                                        )
 6339                                        as table_parquet
 6340                                """
 6341
 6342                                sql_query_annotation_where_clause = """
 6343                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6344                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6345                                """
 6346
 6347                            # Annotation with variants database
 6348                            else:
 6349                                sql_query_annotation_from_clause = f"""
 6350                                    FROM {parquet_file_link} as table_parquet
 6351                                """
 6352                                sql_query_annotation_where_clause = f"""
 6353                                    table_variants."#CHROM" = '{chrom}'
 6354                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6355                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6356                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6357                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6358                                """
 6359
 6360                            # Create update query
 6361                            sql_query_annotation_chrom_interval_pos = f"""
 6362                                UPDATE {table_variants} as table_variants
 6363                                    SET INFO = 
 6364                                        concat(
 6365                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6366                                                THEN table_variants.INFO
 6367                                                ELSE ''
 6368                                            END
 6369                                            ,
 6370                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6371                                                        AND (
 6372                                                        concat({sql_query_annotation_update_info_sets_sql})
 6373                                                        )
 6374                                                        NOT IN ('','.') 
 6375                                                    THEN ';'
 6376                                                    ELSE ''
 6377                                            END
 6378                                            ,
 6379                                            {sql_query_annotation_update_info_sets_sql}
 6380                                            )
 6381                                    {sql_query_annotation_from_clause}
 6382                                    WHERE {sql_query_annotation_where_clause}
 6383                                    ;
 6384                                """
 6385
 6386                            # Add update query to dict
 6387                            query_dict[
 6388                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6389                            ] = sql_query_annotation_chrom_interval_pos
 6390
 6391                        nb_of_query = len(query_dict)
 6392                        num_query = 0
 6393
 6394                        # SET max_expression_depth TO x
 6395                        self.conn.execute("SET max_expression_depth TO 10000")
 6396
 6397                        for query_name in query_dict:
 6398                            query = query_dict[query_name]
 6399                            num_query += 1
 6400                            log.info(
 6401                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6402                            )
 6403                            result = self.conn.execute(query)
 6404                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6405                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6406                            log.info(
 6407                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6408                            )
 6409
 6410                        log.info(
 6411                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6412                        )
 6413
 6414                    else:
 6415
 6416                        log.info(
 6417                            f"Annotation '{annotation_name}' - No Annotations available"
 6418                        )
 6419
 6420                    log.debug("Final header: " + str(vcf_reader.infos))
 6421
 6422        # Remove added columns
 6423        for added_column in added_columns:
 6424            self.drop_column(column=added_column)
 6425
 6426    def annotation_splice(self, threads: int = None) -> None:
 6427        """
 6428        This function annotate with snpEff
 6429
 6430        :param threads: The number of threads to use
 6431        :return: the value of the variable "return_value".
 6432        """
 6433
 6434        # DEBUG
 6435        log.debug("Start annotation with splice tools")
 6436
 6437        # Threads
 6438        if not threads:
 6439            threads = self.get_threads()
 6440        log.debug("Threads: " + str(threads))
 6441
 6442        # DEBUG
 6443        delete_tmp = True
 6444        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6445            delete_tmp = False
 6446            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6447
 6448        # Config
 6449        config = self.get_config()
 6450        log.debug("Config: " + str(config))
 6451        splice_config = config.get("tools", {}).get("splice", {})
 6452        if not splice_config:
 6453            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6454            msg_err = "No Splice tool config"
 6455            raise ValueError(msg_err)
 6456        log.debug(f"splice_config: {splice_config}")
 6457
 6458        # Config - Folders - Databases
 6459        databases_folders = (
 6460            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6461        )
 6462        log.debug("Databases annotations: " + str(databases_folders))
 6463
 6464        # Splice docker image
 6465        splice_docker_image = splice_config.get("docker").get("image")
 6466
 6467        # Pull splice image if it's not already there
 6468        if not check_docker_image_exists(splice_docker_image):
 6469            log.warning(
 6470                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6471            )
 6472            try:
 6473                command(f"docker pull {splice_config.get('docker').get('image')}")
 6474            except subprocess.CalledProcessError:
 6475                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6476                log.error(msg_err)
 6477                raise ValueError(msg_err)
 6478
 6479        # Config - splice databases
 6480        splice_databases = (
 6481            config.get("folders", {})
 6482            .get("databases", {})
 6483            .get("splice", DEFAULT_SPLICE_FOLDER)
 6484        )
 6485        splice_databases = full_path(splice_databases)
 6486
 6487        # Param
 6488        param = self.get_param()
 6489        log.debug("Param: " + str(param))
 6490
 6491        # Param
 6492        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6493        log.debug("Options: " + str(options))
 6494
 6495        # Data
 6496        table_variants = self.get_table_variants()
 6497
 6498        # Check if not empty
 6499        log.debug("Check if not empty")
 6500        sql_query_chromosomes = (
 6501            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6502        )
 6503        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6504            log.info("VCF empty")
 6505            return None
 6506
 6507        # Export in VCF
 6508        log.debug("Create initial file to annotate")
 6509
 6510        # Create output folder / work folder
 6511        if options.get("output_folder", ""):
 6512            output_folder = options.get("output_folder", "")
 6513            if not os.path.exists(output_folder):
 6514                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6515        else:
 6516            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6517            if not os.path.exists(output_folder):
 6518                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6519
 6520        if options.get("workdir", ""):
 6521            workdir = options.get("workdir", "")
 6522        else:
 6523            workdir = "/work"
 6524
 6525        # Create tmp VCF file
 6526        tmp_vcf = NamedTemporaryFile(
 6527            prefix=self.get_prefix(),
 6528            dir=output_folder,
 6529            suffix=".vcf",
 6530            delete=False,
 6531        )
 6532        tmp_vcf_name = tmp_vcf.name
 6533
 6534        # VCF header
 6535        header = self.get_header()
 6536
 6537        # Existing annotations
 6538        for vcf_annotation in self.get_header().infos:
 6539
 6540            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6541            log.debug(
 6542                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6543            )
 6544
 6545        # Memory limit
 6546        if config.get("memory", None):
 6547            memory_limit = config.get("memory", "8G").upper()
 6548            # upper()
 6549        else:
 6550            memory_limit = "8G"
 6551        log.debug(f"memory_limit: {memory_limit}")
 6552
 6553        # Check number of variants to annotate
 6554        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6555        where_clause_regex_spip = r"SPiP_\w+"
 6556        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6557        df_list_of_variants_to_annotate = self.get_query_to_df(
 6558            query=f""" SELECT * FROM variants {where_clause} """
 6559        )
 6560        if len(df_list_of_variants_to_annotate) == 0:
 6561            log.warning(
 6562                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6563            )
 6564            return None
 6565        else:
 6566            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6567
 6568        # Export VCF file
 6569        self.export_variant_vcf(
 6570            vcf_file=tmp_vcf_name,
 6571            remove_info=True,
 6572            add_samples=True,
 6573            index=False,
 6574            where_clause=where_clause,
 6575        )
 6576        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6577        if any(value for value in splice_config.values() if value is None):
 6578            log.warning("At least one splice config parameter is empty")
 6579            # exit annotation_splice
 6580            return None
 6581
 6582        # Params in splice nf
 6583        def check_values(dico: dict):
 6584            """
 6585            Ensure parameters for NF splice pipeline
 6586            """
 6587            for key, val in dico.items():
 6588                if key == "genome":
 6589                    if any(
 6590                        assemb in options.get("genome", {})
 6591                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6592                    ):
 6593                        yield f"--{key} hg19"
 6594                    elif any(
 6595                        assemb in options.get("genome", {})
 6596                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6597                    ):
 6598                        yield f"--{key} hg38"
 6599                elif (
 6600                    (isinstance(val, str) and val)
 6601                    or isinstance(val, int)
 6602                    or isinstance(val, bool)
 6603                ):
 6604                    yield f"--{key} {val}"
 6605
 6606        # Genome
 6607        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6608        options["genome"] = genome
 6609        # NF params
 6610        nf_params = []
 6611        # Add options
 6612        if options:
 6613            log.debug(options)
 6614            nf_params = list(check_values(options))
 6615            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6616        else:
 6617            log.debug("No NF params provided")
 6618        # Add threads
 6619        if "threads" not in options.keys():
 6620            nf_params.append(f"--threads {threads}")
 6621        # Genome path
 6622        genome_path = find_genome(
 6623            config.get("folders", {})
 6624            .get("databases", {})
 6625            .get("genomes", DEFAULT_GENOME_FOLDER),
 6626            file=f"{genome}.fa",
 6627        )
 6628        # Add genome path
 6629        if not genome_path:
 6630            raise ValueError(
 6631                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6632            )
 6633        else:
 6634            log.debug(f"Genome: {genome_path}")
 6635            nf_params.append(f"--genome_path {genome_path}")
 6636
 6637        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6638            """
 6639            Setting up updated databases for SPiP and SpliceAI
 6640            """
 6641
 6642            try:
 6643
 6644                # SpliceAI assembly transcriptome
 6645                spliceai_assembly = os.path.join(
 6646                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6647                    options.get("genome"),
 6648                    "transcriptome",
 6649                )
 6650                spip_assembly = options.get("genome")
 6651
 6652                spip = find(
 6653                    f"transcriptome_{spip_assembly}.RData",
 6654                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6655                )
 6656                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6657                log.debug(f"SPiP annotations: {spip}")
 6658                log.debug(f"SpliceAI annotations: {spliceai}")
 6659                if spip and spliceai:
 6660                    return [
 6661                        f"--spip_transcriptome {spip}",
 6662                        f"--spliceai_transcriptome {spliceai}",
 6663                    ]
 6664                else:
 6665                    log.warning(
 6666                        "Can't find splice databases in configuration, use annotations file from image"
 6667                    )
 6668            except TypeError:
 6669                log.warning(
 6670                    "Can't find splice databases in configuration, use annotations file from image"
 6671                )
 6672                return []
 6673
 6674        # Add options, check if transcriptome option have already beend provided
 6675        if (
 6676            "spip_transcriptome" not in nf_params
 6677            and "spliceai_transcriptome" not in nf_params
 6678        ):
 6679            splice_reference = splice_annotations(options, config)
 6680            if splice_reference:
 6681                nf_params.extend(splice_reference)
 6682        # nf_params.append(f"--output_folder {output_folder}")
 6683        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6684        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6685        log.debug(cmd)
 6686        splice_config["docker"]["command"] = cmd
 6687
 6688        # Ensure proxy is set
 6689        proxy = [
 6690            f"-e {var}={os.getenv(var)}"
 6691            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6692            if os.getenv(var) is not None
 6693        ]
 6694        docker_cmd = get_bin_command(
 6695            tool="splice",
 6696            bin_type="docker",
 6697            config=config,
 6698            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6699            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6700        )
 6701        # print(docker_cmd)
 6702        # exit()
 6703        # Docker debug
 6704        # if splice_config.get("rm_container"):
 6705        #     rm_container = "--rm"
 6706        # else:
 6707        #     rm_container = ""
 6708        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6709        log.debug(docker_cmd)
 6710        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6711        log.debug(res.stdout)
 6712        if res.stderr:
 6713            log.error(res.stderr)
 6714        res.check_returncode()
 6715        # Update variants
 6716        log.info("Annotation - Updating...")
 6717        # Test find output vcf
 6718        log.debug(
 6719            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6720        )
 6721        output_vcf = []
 6722        # Wrong folder to look in
 6723        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6724            if (
 6725                files
 6726                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6727            ):
 6728                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6729        # log.debug(os.listdir(options.get("output_folder")))
 6730        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6731        if not output_vcf:
 6732            log.debug(
 6733                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6734            )
 6735        else:
 6736            # Get new header from annotated vcf
 6737            log.debug(f"Initial header: {len(header.infos)} fields")
 6738            # Create new header with splice infos
 6739            new_vcf = Variants(input=output_vcf[0])
 6740            new_vcf_header = new_vcf.get_header().infos
 6741            for keys, infos in new_vcf_header.items():
 6742                if keys not in header.infos.keys():
 6743                    header.infos[keys] = infos
 6744            log.debug(f"New header: {len(header.infos)} fields")
 6745            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6746            self.update_from_vcf(output_vcf[0])
 6747
 6748        # Remove file
 6749        remove_if_exists(output_vcf)
 6750
 6751    ###
 6752    # Prioritization
 6753    ###
 6754
 6755    def get_config_default(self, name: str) -> dict:
 6756        """
 6757        The function `get_config_default` returns a dictionary containing default configurations for
 6758        various calculations and prioritizations.
 6759
 6760        :param name: The `get_config_default` function returns a dictionary containing default
 6761        configurations for different calculations and prioritizations. The `name` parameter is used to
 6762        specify which specific configuration to retrieve from the dictionary
 6763        :type name: str
 6764        :return: The function `get_config_default` returns a dictionary containing default configuration
 6765        settings for different calculations and prioritizations. The specific configuration settings are
 6766        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6767        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6768        returned. If there is no match, an empty dictionary is returned.
 6769        """
 6770
 6771        config_default = {
 6772            "calculations": {
 6773                "variant_chr_pos_alt_ref": {
 6774                    "type": "sql",
 6775                    "name": "variant_chr_pos_alt_ref",
 6776                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6777                    "available": False,
 6778                    "output_column_name": "variant_chr_pos_alt_ref",
 6779                    "output_column_type": "String",
 6780                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6781                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6782                    "operation_info": True,
 6783                },
 6784                "VARTYPE": {
 6785                    "type": "sql",
 6786                    "name": "VARTYPE",
 6787                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6788                    "available": True,
 6789                    "table": "variants",
 6790                    "output_column_name": "VARTYPE",
 6791                    "output_column_type": "String",
 6792                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6793                    "operation_query": """
 6794                            CASE
 6795                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6796                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6797                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6798                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6799                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6800                                ELSE 'UNDEFINED'
 6801                            END
 6802                            """,
 6803                    "info_fields": ["SVTYPE"],
 6804                    "operation_info": True,
 6805                },
 6806                "snpeff_hgvs": {
 6807                    "type": "python",
 6808                    "name": "snpeff_hgvs",
 6809                    "description": "HGVS nomenclatures from snpEff annotation",
 6810                    "available": True,
 6811                    "function_name": "calculation_extract_snpeff_hgvs",
 6812                    "function_params": ["snpeff_hgvs", "ANN"],
 6813                },
 6814                "snpeff_ann_explode": {
 6815                    "type": "python",
 6816                    "name": "snpeff_ann_explode",
 6817                    "description": "Explode snpEff annotations with uniquify values",
 6818                    "available": True,
 6819                    "function_name": "calculation_snpeff_ann_explode",
 6820                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6821                },
 6822                "snpeff_ann_explode_uniquify": {
 6823                    "type": "python",
 6824                    "name": "snpeff_ann_explode_uniquify",
 6825                    "description": "Explode snpEff annotations",
 6826                    "available": True,
 6827                    "function_name": "calculation_snpeff_ann_explode",
 6828                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6829                },
 6830                "snpeff_ann_explode_json": {
 6831                    "type": "python",
 6832                    "name": "snpeff_ann_explode_json",
 6833                    "description": "Explode snpEff annotations in JSON format",
 6834                    "available": True,
 6835                    "function_name": "calculation_snpeff_ann_explode",
 6836                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6837                },
 6838                "NOMEN": {
 6839                    "type": "python",
 6840                    "name": "NOMEN",
 6841                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6842                    "available": True,
 6843                    "function_name": "calculation_extract_nomen",
 6844                    "function_params": [],
 6845                },
 6846                "RENAME_INFO_FIELDS": {
 6847                    "type": "python",
 6848                    "name": "RENAME_INFO_FIELDS",
 6849                    "description": "Rename or remove INFO/tags",
 6850                    "available": True,
 6851                    "function_name": "calculation_rename_info_fields",
 6852                    "function_params": [],
 6853                },
 6854                "FINDBYPIPELINE": {
 6855                    "type": "python",
 6856                    "name": "FINDBYPIPELINE",
 6857                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6858                    "available": True,
 6859                    "function_name": "calculation_find_by_pipeline",
 6860                    "function_params": ["findbypipeline"],
 6861                },
 6862                "FINDBYSAMPLE": {
 6863                    "type": "python",
 6864                    "name": "FINDBYSAMPLE",
 6865                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6866                    "available": True,
 6867                    "function_name": "calculation_find_by_pipeline",
 6868                    "function_params": ["findbysample"],
 6869                },
 6870                "GENOTYPECONCORDANCE": {
 6871                    "type": "python",
 6872                    "name": "GENOTYPECONCORDANCE",
 6873                    "description": "Concordance of genotype for multi caller VCF",
 6874                    "available": True,
 6875                    "function_name": "calculation_genotype_concordance",
 6876                    "function_params": [],
 6877                },
 6878                "BARCODE": {
 6879                    "type": "python",
 6880                    "name": "BARCODE",
 6881                    "description": "BARCODE as VaRank tool",
 6882                    "available": True,
 6883                    "function_name": "calculation_barcode",
 6884                    "function_params": [],
 6885                },
 6886                "BARCODEFAMILY": {
 6887                    "type": "python",
 6888                    "name": "BARCODEFAMILY",
 6889                    "description": "BARCODEFAMILY as VaRank tool",
 6890                    "available": True,
 6891                    "function_name": "calculation_barcode_family",
 6892                    "function_params": ["BCF"],
 6893                },
 6894                "TRIO": {
 6895                    "type": "python",
 6896                    "name": "TRIO",
 6897                    "description": "Inheritance for a trio family",
 6898                    "available": True,
 6899                    "function_name": "calculation_trio",
 6900                    "function_params": [],
 6901                },
 6902                "VAF": {
 6903                    "type": "python",
 6904                    "name": "VAF",
 6905                    "description": "Variant Allele Frequency (VAF) harmonization",
 6906                    "available": True,
 6907                    "function_name": "calculation_vaf_normalization",
 6908                    "function_params": [],
 6909                },
 6910                "VAF_stats": {
 6911                    "type": "python",
 6912                    "name": "VAF_stats",
 6913                    "description": "Variant Allele Frequency (VAF) statistics",
 6914                    "available": True,
 6915                    "function_name": "calculation_genotype_stats",
 6916                    "function_params": ["VAF"],
 6917                },
 6918                "DP_stats": {
 6919                    "type": "python",
 6920                    "name": "DP_stats",
 6921                    "description": "Depth (DP) statistics",
 6922                    "available": True,
 6923                    "function_name": "calculation_genotype_stats",
 6924                    "function_params": ["DP"],
 6925                },
 6926                "variant_id": {
 6927                    "type": "python",
 6928                    "name": "variant_id",
 6929                    "description": "Variant ID generated from variant position and type",
 6930                    "available": True,
 6931                    "function_name": "calculation_variant_id",
 6932                    "function_params": [],
 6933                },
 6934                "transcripts_json": {
 6935                    "type": "python",
 6936                    "name": "transcripts_json",
 6937                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6938                    "available": True,
 6939                    "function_name": "calculation_transcripts_annotation",
 6940                    "function_params": ["transcripts_json", None],
 6941                },
 6942                "transcripts_ann": {
 6943                    "type": "python",
 6944                    "name": "transcripts_ann",
 6945                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6946                    "available": True,
 6947                    "function_name": "calculation_transcripts_annotation",
 6948                    "function_params": [None, "transcripts_ann"],
 6949                },
 6950                "transcripts_annotations": {
 6951                    "type": "python",
 6952                    "name": "transcripts_annotations",
 6953                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6954                    "available": True,
 6955                    "function_name": "calculation_transcripts_annotation",
 6956                    "function_params": [None, None],
 6957                },
 6958                "transcripts_prioritization": {
 6959                    "type": "python",
 6960                    "name": "transcripts_prioritization",
 6961                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6962                    "available": True,
 6963                    "function_name": "calculation_transcripts_prioritization",
 6964                    "function_params": [],
 6965                },
 6966                "transcripts_export": {
 6967                    "type": "python",
 6968                    "name": "transcripts_export",
 6969                    "description": "Export transcripts table/view as a file (using param.json)",
 6970                    "available": True,
 6971                    "function_name": "calculation_transcripts_export",
 6972                    "function_params": [],
 6973                },
 6974            },
 6975            "prioritizations": {
 6976                "default": {
 6977                    "ANN2": [
 6978                        {
 6979                            "type": "contains",
 6980                            "value": "HIGH",
 6981                            "score": 5,
 6982                            "flag": "PASS",
 6983                            "comment": [
 6984                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6985                            ],
 6986                        },
 6987                        {
 6988                            "type": "contains",
 6989                            "value": "MODERATE",
 6990                            "score": 3,
 6991                            "flag": "PASS",
 6992                            "comment": [
 6993                                "A non-disruptive variant that might change protein effectiveness"
 6994                            ],
 6995                        },
 6996                        {
 6997                            "type": "contains",
 6998                            "value": "LOW",
 6999                            "score": 0,
 7000                            "flag": "FILTERED",
 7001                            "comment": [
 7002                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 7003                            ],
 7004                        },
 7005                        {
 7006                            "type": "contains",
 7007                            "value": "MODIFIER",
 7008                            "score": 0,
 7009                            "flag": "FILTERED",
 7010                            "comment": [
 7011                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 7012                            ],
 7013                        },
 7014                    ],
 7015                }
 7016            },
 7017        }
 7018
 7019        return config_default.get(name, None)
 7020
 7021    def get_config_json(
 7022        self, name: str, config_dict: dict = {}, config_file: str = None
 7023    ) -> dict:
 7024        """
 7025        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7026        default values, a dictionary, and a file.
 7027
 7028        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7029        the name of the configuration. It is used to identify and retrieve the configuration settings
 7030        for a specific component or module
 7031        :type name: str
 7032        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7033        dictionary that allows you to provide additional configuration settings or overrides. When you
 7034        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7035        the key is the configuration setting you want to override or
 7036        :type config_dict: dict
 7037        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7038        specify the path to a configuration file that contains additional settings. If provided, the
 7039        function will read the contents of this file and update the configuration dictionary with the
 7040        values found in the file, overriding any existing values with the
 7041        :type config_file: str
 7042        :return: The function `get_config_json` returns a dictionary containing the configuration
 7043        settings.
 7044        """
 7045
 7046        # Create with default prioritizations
 7047        config_default = self.get_config_default(name=name)
 7048        configuration = config_default
 7049        # log.debug(f"configuration={configuration}")
 7050
 7051        # Replace prioritizations from dict
 7052        for config in config_dict:
 7053            configuration[config] = config_dict[config]
 7054
 7055        # Replace prioritizations from file
 7056        config_file = full_path(config_file)
 7057        if config_file:
 7058            if os.path.exists(config_file):
 7059                with open(config_file) as config_file_content:
 7060                    config_file_dict = yaml.safe_load(config_file_content)
 7061                for config in config_file_dict:
 7062                    configuration[config] = config_file_dict[config]
 7063            else:
 7064                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7065                log.error(msg_error)
 7066                raise ValueError(msg_error)
 7067
 7068        return configuration
 7069
 7070    def prioritization(
 7071        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7072    ) -> bool:
 7073        """
 7074        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7075        prioritizes variants based on configured profiles and criteria.
 7076
 7077        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7078        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7079        a table name is provided, the method will prioritize the variants in that specific table
 7080        :type table: str
 7081        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7082        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7083        provided, the code will use a default prefix value of "PZ"
 7084        :type pz_prefix: str
 7085        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7086        additional parameters specific to the prioritization process. These parameters can include
 7087        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7088        configurations needed for the prioritization of variants in a V
 7089        :type pz_param: dict
 7090        :return: A boolean value (True) is being returned from the `prioritization` function.
 7091        """
 7092
 7093        # Config
 7094        config = self.get_config()
 7095
 7096        # Param
 7097        param = self.get_param()
 7098
 7099        # Prioritization param
 7100        if pz_param is not None:
 7101            prioritization_param = pz_param
 7102        else:
 7103            prioritization_param = param.get("prioritization", {})
 7104
 7105        # Configuration profiles
 7106        prioritization_config_file = prioritization_param.get(
 7107            "prioritization_config", None
 7108        )
 7109        prioritization_config_file = full_path(prioritization_config_file)
 7110        prioritizations_config = self.get_config_json(
 7111            name="prioritizations", config_file=prioritization_config_file
 7112        )
 7113
 7114        # Prioritization prefix
 7115        pz_prefix_default = "PZ"
 7116        if pz_prefix is None:
 7117            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7118
 7119        # Prioritization options
 7120        profiles = prioritization_param.get("profiles", [])
 7121        if isinstance(profiles, str):
 7122            profiles = profiles.split(",")
 7123        pzfields = prioritization_param.get(
 7124            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7125        )
 7126        if isinstance(pzfields, str):
 7127            pzfields = pzfields.split(",")
 7128        default_profile = prioritization_param.get("default_profile", None)
 7129        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7130        prioritization_score_mode = prioritization_param.get(
 7131            "prioritization_score_mode", "HOWARD"
 7132        )
 7133
 7134        # Quick Prioritizations
 7135        prioritizations = param.get("prioritizations", None)
 7136        if prioritizations:
 7137            log.info("Quick Prioritization:")
 7138            for profile in prioritizations.split(","):
 7139                if profile not in profiles:
 7140                    profiles.append(profile)
 7141                    log.info(f"   {profile}")
 7142
 7143        # If profile "ALL" provided, all profiles in the config profiles
 7144        if "ALL" in profiles:
 7145            profiles = list(prioritizations_config.keys())
 7146
 7147        for profile in profiles:
 7148            if prioritizations_config.get(profile, None):
 7149                log.debug(f"Profile '{profile}' configured")
 7150            else:
 7151                msg_error = f"Profile '{profile}' NOT configured"
 7152                log.error(msg_error)
 7153                raise ValueError(msg_error)
 7154
 7155        if profiles:
 7156            log.info(f"Prioritization... ")
 7157        else:
 7158            log.debug(f"No profile defined")
 7159            return False
 7160
 7161        if not default_profile and len(profiles):
 7162            default_profile = profiles[0]
 7163
 7164        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7165        log.debug("Profiles to check: " + str(list(profiles)))
 7166
 7167        # Variables
 7168        if table is not None:
 7169            table_variants = table
 7170        else:
 7171            table_variants = self.get_table_variants(clause="update")
 7172        log.debug(f"Table to prioritize: {table_variants}")
 7173
 7174        # Added columns
 7175        added_columns = []
 7176
 7177        # Create list of PZfields
 7178        # List of PZFields
 7179        list_of_pzfields_original = pzfields + [
 7180            pzfield + pzfields_sep + profile
 7181            for pzfield in pzfields
 7182            for profile in profiles
 7183        ]
 7184        list_of_pzfields = []
 7185        log.debug(f"{list_of_pzfields_original}")
 7186
 7187        # Remove existing PZfields to use if exists
 7188        for pzfield in list_of_pzfields_original:
 7189            if self.get_header().infos.get(pzfield, None) is None:
 7190                list_of_pzfields.append(pzfield)
 7191                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7192            else:
 7193                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7194
 7195        if list_of_pzfields:
 7196
 7197            # Explode Infos prefix
 7198            explode_infos_prefix = self.get_explode_infos_prefix()
 7199
 7200            # PZfields tags description
 7201            PZfields_INFOS = {
 7202                f"{pz_prefix}Tags": {
 7203                    "ID": f"{pz_prefix}Tags",
 7204                    "Number": ".",
 7205                    "Type": "String",
 7206                    "Description": "Variant tags based on annotation criteria",
 7207                },
 7208                f"{pz_prefix}Score": {
 7209                    "ID": f"{pz_prefix}Score",
 7210                    "Number": 1,
 7211                    "Type": "Integer",
 7212                    "Description": "Variant score based on annotation criteria",
 7213                },
 7214                f"{pz_prefix}Flag": {
 7215                    "ID": f"{pz_prefix}Flag",
 7216                    "Number": 1,
 7217                    "Type": "String",
 7218                    "Description": "Variant flag based on annotation criteria",
 7219                },
 7220                f"{pz_prefix}Comment": {
 7221                    "ID": f"{pz_prefix}Comment",
 7222                    "Number": ".",
 7223                    "Type": "String",
 7224                    "Description": "Variant comment based on annotation criteria",
 7225                },
 7226                f"{pz_prefix}Infos": {
 7227                    "ID": f"{pz_prefix}Infos",
 7228                    "Number": ".",
 7229                    "Type": "String",
 7230                    "Description": "Variant infos based on annotation criteria",
 7231                },
 7232                f"{pz_prefix}Class": {
 7233                    "ID": f"{pz_prefix}Class",
 7234                    "Number": ".",
 7235                    "Type": "String",
 7236                    "Description": "Variant class based on annotation criteria",
 7237                },
 7238            }
 7239
 7240            # Create INFO fields if not exist
 7241            for field in PZfields_INFOS:
 7242                field_ID = PZfields_INFOS[field]["ID"]
 7243                field_description = PZfields_INFOS[field]["Description"]
 7244                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7245                    field_description = (
 7246                        PZfields_INFOS[field]["Description"]
 7247                        + f", profile {default_profile}"
 7248                    )
 7249                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7250                        field_ID,
 7251                        PZfields_INFOS[field]["Number"],
 7252                        PZfields_INFOS[field]["Type"],
 7253                        field_description,
 7254                        "unknown",
 7255                        "unknown",
 7256                        code_type_map[PZfields_INFOS[field]["Type"]],
 7257                    )
 7258
 7259            # Create INFO fields if not exist for each profile
 7260            for profile in prioritizations_config:
 7261                if profile in profiles or profiles == []:
 7262                    for field in PZfields_INFOS:
 7263                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7264                        field_description = (
 7265                            PZfields_INFOS[field]["Description"]
 7266                            + f", profile {profile}"
 7267                        )
 7268                        if (
 7269                            field_ID not in self.get_header().infos
 7270                            and field in pzfields
 7271                        ):
 7272                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7273                                field_ID,
 7274                                PZfields_INFOS[field]["Number"],
 7275                                PZfields_INFOS[field]["Type"],
 7276                                field_description,
 7277                                "unknown",
 7278                                "unknown",
 7279                                code_type_map[PZfields_INFOS[field]["Type"]],
 7280                            )
 7281
 7282            # Header
 7283            for pzfield in list_of_pzfields:
 7284                if re.match(f"{pz_prefix}Score.*", pzfield):
 7285                    added_column = self.add_column(
 7286                        table_name=table_variants,
 7287                        column_name=pzfield,
 7288                        column_type="INTEGER",
 7289                        default_value="0",
 7290                    )
 7291                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7292                    added_column = self.add_column(
 7293                        table_name=table_variants,
 7294                        column_name=pzfield,
 7295                        column_type="BOOLEAN",
 7296                        default_value="1",
 7297                    )
 7298                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7299                    added_column = self.add_column(
 7300                        table_name=table_variants,
 7301                        column_name=pzfield,
 7302                        column_type="VARCHAR[]",
 7303                        default_value="null",
 7304                    )
 7305                else:
 7306                    added_column = self.add_column(
 7307                        table_name=table_variants,
 7308                        column_name=pzfield,
 7309                        column_type="STRING",
 7310                        default_value="''",
 7311                    )
 7312                added_columns.append(added_column)
 7313
 7314            # Profiles
 7315            if profiles:
 7316
 7317                # foreach profile in configuration file
 7318                for profile in prioritizations_config:
 7319
 7320                    # If profile is asked in param, or ALL are asked (empty profile [])
 7321                    if profile in profiles or profiles == []:
 7322                        log.info(f"Profile '{profile}'")
 7323
 7324                        sql_set_info_option = ""
 7325
 7326                        sql_set_info = []
 7327
 7328                        # PZ fields set
 7329
 7330                        # PZScore
 7331                        if (
 7332                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7333                            in list_of_pzfields
 7334                        ):
 7335                            sql_set_info.append(
 7336                                f"""
 7337                                    concat(
 7338                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7339                                        {pz_prefix}Score{pzfields_sep}{profile}
 7340                                    ) 
 7341                                """
 7342                            )
 7343                            if (
 7344                                profile == default_profile
 7345                                and f"{pz_prefix}Score" in list_of_pzfields
 7346                            ):
 7347                                sql_set_info.append(
 7348                                    f"""
 7349                                        concat(
 7350                                            '{pz_prefix}Score=',
 7351                                            {pz_prefix}Score{pzfields_sep}{profile}
 7352                                        )
 7353                                    """
 7354                                )
 7355
 7356                        # PZFlag
 7357                        if (
 7358                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7359                            in list_of_pzfields
 7360                        ):
 7361                            sql_set_info.append(
 7362                                f"""
 7363                                    concat(
 7364                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7365                                        CASE 
 7366                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7367                                            THEN 'PASS'
 7368                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7369                                            THEN 'FILTERED'
 7370                                        END
 7371                                    ) 
 7372                                """
 7373                            )
 7374                            if (
 7375                                profile == default_profile
 7376                                and f"{pz_prefix}Flag" in list_of_pzfields
 7377                            ):
 7378                                sql_set_info.append(
 7379                                    f"""
 7380                                        concat(
 7381                                            '{pz_prefix}Flag=',
 7382                                            CASE 
 7383                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7384                                                THEN 'PASS'
 7385                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7386                                                THEN 'FILTERED'
 7387                                            END
 7388                                        )
 7389                                    """
 7390                                )
 7391
 7392                        # PZClass
 7393                        if (
 7394                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7395                            in list_of_pzfields
 7396                        ):
 7397                            sql_set_info.append(
 7398                                f"""
 7399                                    concat(
 7400                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7401                                        CASE
 7402                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7403                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7404                                            ELSE '.'
 7405                                        END 
 7406                                    )
 7407                                    
 7408                                """
 7409                            )
 7410                            if (
 7411                                profile == default_profile
 7412                                and f"{pz_prefix}Class" in list_of_pzfields
 7413                            ):
 7414                                sql_set_info.append(
 7415                                    f"""
 7416                                        concat(
 7417                                            '{pz_prefix}Class=',
 7418                                            CASE
 7419                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7420                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7421                                                ELSE '.'
 7422                                            END 
 7423                                        )
 7424                                    """
 7425                                )
 7426
 7427                        # PZComment
 7428                        if (
 7429                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7430                            in list_of_pzfields
 7431                        ):
 7432                            sql_set_info.append(
 7433                                f"""
 7434                                    CASE
 7435                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7436                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7437                                        ELSE ''
 7438                                    END
 7439                                """
 7440                            )
 7441                            if (
 7442                                profile == default_profile
 7443                                and f"{pz_prefix}Comment" in list_of_pzfields
 7444                            ):
 7445                                sql_set_info.append(
 7446                                    f"""
 7447                                        CASE
 7448                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7449                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7450                                            ELSE ''
 7451                                        END
 7452                                    """
 7453                                )
 7454
 7455                        # PZInfos
 7456                        if (
 7457                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7458                            in list_of_pzfields
 7459                        ):
 7460                            sql_set_info.append(
 7461                                f"""
 7462                                    CASE
 7463                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7464                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7465                                        ELSE ''
 7466                                    END
 7467                                """
 7468                            )
 7469                            if (
 7470                                profile == default_profile
 7471                                and f"{pz_prefix}Infos" in list_of_pzfields
 7472                            ):
 7473                                sql_set_info.append(
 7474                                    f"""
 7475                                        CASE
 7476                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7477                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7478                                            ELSE ''
 7479                                        END
 7480                                    """
 7481                                )
 7482
 7483                        # Merge PZfields
 7484                        sql_set_info_option = ""
 7485                        sql_set_sep = ""
 7486                        for sql_set in sql_set_info:
 7487                            if sql_set_sep:
 7488                                sql_set_info_option += f"""
 7489                                    , concat('{sql_set_sep}', {sql_set})
 7490                                """
 7491                            else:
 7492                                sql_set_info_option += f"""
 7493                                    , {sql_set}
 7494                                """
 7495                            sql_set_sep = ";"
 7496
 7497                        sql_queries = []
 7498                        for annotation in prioritizations_config[profile]:
 7499
 7500                            # skip special sections
 7501                            if annotation.startswith("_"):
 7502                                continue
 7503
 7504                            # For each criterions
 7505                            for criterion in prioritizations_config[profile][
 7506                                annotation
 7507                            ]:
 7508
 7509                                # Criterion mode
 7510                                criterion_mode = None
 7511                                if np.any(
 7512                                    np.isin(list(criterion.keys()), ["type", "value"])
 7513                                ):
 7514                                    criterion_mode = "operation"
 7515                                elif np.any(
 7516                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7517                                ):
 7518                                    criterion_mode = "sql"
 7519                                log.debug(f"Criterion Mode: {criterion_mode}")
 7520
 7521                                # Criterion parameters
 7522                                criterion_type = criterion.get("type", None)
 7523                                criterion_value = criterion.get("value", None)
 7524                                criterion_sql = criterion.get("sql", None)
 7525                                criterion_fields = criterion.get("fields", None)
 7526                                criterion_score = criterion.get("score", 0)
 7527                                criterion_flag = criterion.get("flag", "PASS")
 7528                                criterion_class = criterion.get("class", None)
 7529                                criterion_flag_bool = criterion_flag == "PASS"
 7530                                criterion_comment = (
 7531                                    ", ".join(criterion.get("comment", []))
 7532                                    .replace("'", "''")
 7533                                    .replace(";", ",")
 7534                                    .replace("\t", " ")
 7535                                )
 7536                                criterion_infos = (
 7537                                    str(criterion)
 7538                                    .replace("'", "''")
 7539                                    .replace(";", ",")
 7540                                    .replace("\t", " ")
 7541                                )
 7542
 7543                                # SQL
 7544                                if criterion_sql is not None and isinstance(
 7545                                    criterion_sql, list
 7546                                ):
 7547                                    criterion_sql = " ".join(criterion_sql)
 7548
 7549                                # Fields and explode
 7550                                if criterion_fields is None:
 7551                                    criterion_fields = [annotation]
 7552                                if not isinstance(criterion_fields, list):
 7553                                    criterion_fields = str(criterion_fields).split(",")
 7554
 7555                                # Class
 7556                                if criterion_class is not None and not isinstance(
 7557                                    criterion_class, list
 7558                                ):
 7559                                    criterion_class = str(criterion_class).split(",")
 7560
 7561                                for annotation_field in criterion_fields:
 7562
 7563                                    # Explode specific annotation
 7564                                    log.debug(
 7565                                        f"Explode annotation '{annotation_field}'"
 7566                                    )
 7567                                    added_columns += self.explode_infos(
 7568                                        prefix=explode_infos_prefix,
 7569                                        fields=[annotation_field],
 7570                                        table=table_variants,
 7571                                    )
 7572                                    extra_infos = self.get_extra_infos(
 7573                                        table=table_variants
 7574                                    )
 7575
 7576                                    # Check if annotation field is present
 7577                                    if (
 7578                                        f"{explode_infos_prefix}{annotation_field}"
 7579                                        not in extra_infos
 7580                                    ):
 7581                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7582                                        log.error(msq_err)
 7583                                        raise ValueError(msq_err)
 7584                                    else:
 7585                                        log.debug(
 7586                                            f"Annotation '{annotation_field}' in data"
 7587                                        )
 7588
 7589                                sql_set = []
 7590                                sql_set_info = []
 7591
 7592                                # PZ fields set
 7593
 7594                                # PZScore
 7595                                if (
 7596                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7597                                    in list_of_pzfields
 7598                                ):
 7599                                    # VaRank prioritization score mode
 7600                                    if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]:
 7601                                        sql_set.append(
 7602                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
 7603                                        )
 7604                                    # default HOWARD prioritization score mode
 7605                                    else:
 7606                                        sql_set.append(
 7607                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7608                                        )
 7609
 7610                                # PZFlag
 7611                                if (
 7612                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7613                                    in list_of_pzfields
 7614                                ):
 7615                                    sql_set.append(
 7616                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7617                                    )
 7618
 7619                                # PZClass
 7620                                if (
 7621                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7622                                    in list_of_pzfields
 7623                                    and criterion_class is not None
 7624                                ):
 7625                                    sql_set.append(
 7626                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7627                                    )
 7628
 7629                                # PZComment
 7630                                if (
 7631                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7632                                    in list_of_pzfields
 7633                                ):
 7634                                    sql_set.append(
 7635                                        f"""
 7636                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7637                                                concat(
 7638                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7639                                                    CASE 
 7640                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7641                                                        THEN ', '
 7642                                                        ELSE ''
 7643                                                    END,
 7644                                                    '{criterion_comment}'
 7645                                                )
 7646                                        """
 7647                                    )
 7648
 7649                                # PZInfos
 7650                                if (
 7651                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7652                                    in list_of_pzfields
 7653                                ):
 7654                                    sql_set.append(
 7655                                        f"""
 7656                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7657                                                concat(
 7658                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7659                                                    '{criterion_infos}'
 7660                                                )
 7661                                        """
 7662                                    )
 7663                                sql_set_option = ",".join(sql_set)
 7664
 7665                                # Criterion and comparison
 7666                                if sql_set_option:
 7667
 7668                                    if criterion_mode in ["operation"]:
 7669
 7670                                        try:
 7671                                            float(criterion_value)
 7672                                            sql_update = f"""
 7673                                                UPDATE {table_variants}
 7674                                                SET {sql_set_option}
 7675                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7676                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7677                                            """
 7678                                        except:
 7679                                            contains_option = ""
 7680                                            if criterion_type == "contains":
 7681                                                contains_option = ".*"
 7682                                            sql_update = f"""
 7683                                                UPDATE {table_variants}
 7684                                                SET {sql_set_option}
 7685                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7686                                            """
 7687                                        sql_queries.append(sql_update)
 7688
 7689                                    elif criterion_mode in ["sql"]:
 7690
 7691                                        sql_update = f"""
 7692                                            UPDATE {table_variants}
 7693                                            SET {sql_set_option}
 7694                                            WHERE {criterion_sql}
 7695                                        """
 7696                                        sql_queries.append(sql_update)
 7697
 7698                                    else:
 7699                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7700                                        log.error(msg_err)
 7701                                        raise ValueError(msg_err)
 7702
 7703                                else:
 7704                                    log.warning(
 7705                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7706                                    )
 7707
 7708                        # PZTags
 7709                        if (
 7710                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7711                            in list_of_pzfields
 7712                        ):
 7713
                            # Create PZTags value
 7715                            pztags_value = ""
 7716                            pztags_sep_default = ","
 7717                            pztags_sep = ""
 7718                            for pzfield in pzfields:
 7719                                if pzfield not in [f"{pz_prefix}Tags"]:
 7720                                    if (
 7721                                        f"{pzfield}{pzfields_sep}{profile}"
 7722                                        in list_of_pzfields
 7723                                    ):
 7724                                        if pzfield in [f"{pz_prefix}Flag"]:
 7725                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7726                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7727                                                    THEN 'PASS'
 7728                                                    ELSE 'FILTERED'
 7729                                                END, '"""
 7730                                        elif pzfield in [f"{pz_prefix}Class"]:
 7731                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7732                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7733                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7734                                                    ELSE '.'
 7735                                                END, '"""
 7736                                        else:
 7737                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7738                                        pztags_sep = pztags_sep_default
 7739
                            # Add Query update for PZTags
 7741                            sql_update_pztags = f"""
 7742                                UPDATE {table_variants}
 7743                                SET INFO = concat(
 7744                                        INFO,
 7745                                        CASE WHEN INFO NOT in ('','.')
 7746                                                THEN ';'
 7747                                                ELSE ''
 7748                                        END,
 7749                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7750                                    )
 7751                                """
 7752                            sql_queries.append(sql_update_pztags)
 7753
                            # Add Query update for PZTags for default profile
 7755                            if profile == default_profile:
 7756                                sql_update_pztags_default = f"""
 7757                                UPDATE {table_variants}
 7758                                SET INFO = concat(
 7759                                        INFO,
 7760                                        ';',
 7761                                        '{pz_prefix}Tags={pztags_value}'
 7762                                    )
 7763                                """
 7764                                sql_queries.append(sql_update_pztags_default)
 7765
 7766                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7767
 7768                        if sql_queries:
 7769
 7770                            for sql_query in sql_queries:
 7771                                log.debug(
 7772                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7773                                )
 7774                                self.conn.execute(sql_query)
 7775
 7776                        log.info(f"""Profile '{profile}' - Update... """)
 7777                        sql_query_update = f"""
 7778                            UPDATE {table_variants}
 7779                            SET INFO =  
 7780                                concat(
 7781                                    CASE
 7782                                        WHEN INFO NOT IN ('','.')
 7783                                        THEN concat(INFO, ';')
 7784                                        ELSE ''
 7785                                    END
 7786                                    {sql_set_info_option}
 7787                                )
 7788                        """
 7789                        self.conn.execute(sql_query_update)
 7790
 7791        else:
 7792
 7793            log.warning(f"No profiles in parameters")
 7794
 7795        # Remove added columns
 7796        for added_column in added_columns:
 7797            self.drop_column(column=added_column)
 7798
 7799        # Explode INFOS fields into table fields
 7800        if self.get_explode_infos():
 7801            self.explode_infos(
 7802                prefix=self.get_explode_infos_prefix(),
 7803                fields=self.get_explode_infos_fields(),
 7804                force=True,
 7805            )
 7806
 7807        return True
 7808
 7809    ###
 7810    # HGVS
 7811    ###
 7812
 7813    def annotation_hgvs(self, threads: int = None) -> None:
 7814        """
 7815        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7816        coordinates and alleles.
 7817
 7818        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7819        threads to use for parallel processing. If no value is provided, it will default to the number
 7820        of threads obtained from the `get_threads()` method
 7821        :type threads: int
 7822        """
 7823
 7824        # Function for each partition of the Dask Dataframe
 7825        def partition_function(partition):
 7826            """
 7827            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7828            each row of a DataFrame called `partition`.
 7829
 7830            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7831            to be processed
 7832            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7833            the "partition" dataframe along the axis 1.
 7834            """
 7835            return partition.apply(annotation_hgvs_partition, axis=1)
 7836
 7837        def annotation_hgvs_partition(row) -> str:
 7838            """
 7839            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7840            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7841
 7842            :param row: A dictionary-like object that contains the values for the following keys:
 7843            :return: a string that contains the HGVS names associated with the given row of data.
 7844            """
 7845
 7846            chr = row["CHROM"]
 7847            pos = row["POS"]
 7848            ref = row["REF"]
 7849            alt = row["ALT"]
 7850
 7851            # Find list of associated transcripts
 7852            transcripts_list = list(
 7853                polars_conn.execute(
 7854                    f"""
 7855                SELECT transcript
 7856                FROM refseq_df
 7857                WHERE CHROM='{chr}'
 7858                AND POS={pos}
 7859            """
 7860                )["transcript"]
 7861            )
 7862
 7863            # Full HGVS annotation in list
 7864            hgvs_full_list = []
 7865
 7866            for transcript_name in transcripts_list:
 7867
 7868                # Transcript
 7869                transcript = get_transcript(
 7870                    transcripts=transcripts, transcript_name=transcript_name
 7871                )
 7872                # Exon
 7873                if use_exon:
 7874                    exon = transcript.find_exon_number(pos)
 7875                else:
 7876                    exon = None
 7877                # Protein
 7878                transcript_protein = None
 7879                if use_protein or add_protein or full_format:
 7880                    transcripts_protein = list(
 7881                        polars_conn.execute(
 7882                            f"""
 7883                        SELECT protein
 7884                        FROM refseqlink_df
 7885                        WHERE transcript='{transcript_name}'
 7886                        LIMIT 1
 7887                    """
 7888                        )["protein"]
 7889                    )
 7890                    if len(transcripts_protein):
 7891                        transcript_protein = transcripts_protein[0]
 7892
 7893                # HGVS name
 7894                hgvs_name = format_hgvs_name(
 7895                    chr,
 7896                    pos,
 7897                    ref,
 7898                    alt,
 7899                    genome=genome,
 7900                    transcript=transcript,
 7901                    transcript_protein=transcript_protein,
 7902                    exon=exon,
 7903                    use_gene=use_gene,
 7904                    use_protein=use_protein,
 7905                    full_format=full_format,
 7906                    use_version=use_version,
 7907                    codon_type=codon_type,
 7908                )
 7909                hgvs_full_list.append(hgvs_name)
 7910                if add_protein and not use_protein and not full_format:
 7911                    hgvs_name = format_hgvs_name(
 7912                        chr,
 7913                        pos,
 7914                        ref,
 7915                        alt,
 7916                        genome=genome,
 7917                        transcript=transcript,
 7918                        transcript_protein=transcript_protein,
 7919                        exon=exon,
 7920                        use_gene=use_gene,
 7921                        use_protein=True,
 7922                        full_format=False,
 7923                        use_version=use_version,
 7924                        codon_type=codon_type,
 7925                    )
 7926                    hgvs_full_list.append(hgvs_name)
 7927
 7928            # Create liste of HGVS annotations
 7929            hgvs_full = ",".join(hgvs_full_list)
 7930
 7931            return hgvs_full
 7932
 7933        # Polars connexion
 7934        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7935
 7936        # Config
 7937        config = self.get_config()
 7938
 7939        # Databases
 7940        # Genome
 7941        databases_genomes_folders = (
 7942            config.get("folders", {})
 7943            .get("databases", {})
 7944            .get("genomes", DEFAULT_GENOME_FOLDER)
 7945        )
 7946        databases_genome = (
 7947            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7948        )
 7949        # refseq database folder
 7950        databases_refseq_folders = (
 7951            config.get("folders", {})
 7952            .get("databases", {})
 7953            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7954        )
 7955        # refseq
 7956        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7957        # refSeqLink
 7958        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7959
 7960        # Param
 7961        param = self.get_param()
 7962
 7963        # Quick HGVS
 7964        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7965            log.info(f"Quick HGVS Annotation:")
 7966            if not param.get("hgvs", None):
 7967                param["hgvs"] = {}
 7968            for option in param.get("hgvs_options", "").split(","):
 7969                option_var_val = option.split("=")
 7970                option_var = option_var_val[0]
 7971                if len(option_var_val) > 1:
 7972                    option_val = option_var_val[1]
 7973                else:
 7974                    option_val = "True"
 7975                if option_val.upper() in ["TRUE"]:
 7976                    option_val = True
 7977                elif option_val.upper() in ["FALSE"]:
 7978                    option_val = False
 7979                log.info(f"   {option_var}={option_val}")
 7980                param["hgvs"][option_var] = option_val
 7981
 7982        # Check if HGVS annotation enabled
 7983        if "hgvs" in param:
 7984            log.info(f"HGVS Annotation... ")
 7985            for hgvs_option in param.get("hgvs", {}):
 7986                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7987        else:
 7988            return
 7989
 7990        # HGVS Param
 7991        param_hgvs = param.get("hgvs", {})
 7992        use_exon = param_hgvs.get("use_exon", False)
 7993        use_gene = param_hgvs.get("use_gene", False)
 7994        use_protein = param_hgvs.get("use_protein", False)
 7995        add_protein = param_hgvs.get("add_protein", False)
 7996        full_format = param_hgvs.get("full_format", False)
 7997        use_version = param_hgvs.get("use_version", False)
 7998        codon_type = param_hgvs.get("codon_type", "3")
 7999
        # refSeq and refSeqLink
 8001        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 8002        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 8003
 8004        # Assembly
 8005        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 8006
 8007        # Genome
 8008        genome_file = None
 8009        if find_genome(databases_genome):
 8010            genome_file = find_genome(databases_genome)
 8011        else:
 8012            genome_file = find_genome(
 8013                genome_path=databases_genomes_folders, assembly=assembly
 8014            )
 8015        log.debug("Genome: " + str(genome_file))
 8016
        # refSeq
 8018        refseq_file = find_file_prefix(
 8019            input_file=databases_refseq,
 8020            prefix="ncbiRefSeq",
 8021            folder=databases_refseq_folders,
 8022            assembly=assembly,
 8023        )
 8024        log.debug("refSeq: " + str(refseq_file))
 8025
 8026        # refSeqLink
 8027        refseqlink_file = find_file_prefix(
 8028            input_file=databases_refseqlink,
 8029            prefix="ncbiRefSeqLink",
 8030            folder=databases_refseq_folders,
 8031            assembly=assembly,
 8032        )
 8033        log.debug("refSeqLink: " + str(refseqlink_file))
 8034
 8035        # Threads
 8036        if not threads:
 8037            threads = self.get_threads()
 8038        log.debug("Threads: " + str(threads))
 8039
 8040        # Variables
 8041        table_variants = self.get_table_variants(clause="update")
 8042
 8043        # Get variants SNV and InDel only
 8044        query_variants = f"""
 8045            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 8046            FROM {table_variants}
 8047            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 8048            """
 8049        df_variants = self.get_query_to_df(query_variants)
 8050
 8051        # Added columns
 8052        added_columns = []
 8053
 8054        # Add hgvs column in variants table
 8055        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 8056        added_column = self.add_column(
 8057            table_variants, hgvs_column_name, "STRING", default_value=None
 8058        )
 8059        added_columns.append(added_column)
 8060
 8061        log.debug(f"refSeq loading...")
 8062        # refSeq in duckDB
 8063        refseq_table = get_refseq_table(
 8064            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 8065        )
 8066        # Loading all refSeq in Dataframe
 8067        refseq_query = f"""
 8068            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 8069            FROM {refseq_table}
 8070            JOIN df_variants ON (
 8071                {refseq_table}.chrom = df_variants.CHROM
 8072                AND {refseq_table}.txStart<=df_variants.POS
 8073                AND {refseq_table}.txEnd>=df_variants.POS
 8074            )
 8075        """
 8076        refseq_df = self.conn.query(refseq_query).pl()
 8077
 8078        if refseqlink_file:
 8079            log.debug(f"refSeqLink loading...")
 8080            # refSeqLink in duckDB
 8081            refseqlink_table = get_refseq_table(
 8082                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 8083            )
 8084            # Loading all refSeqLink in Dataframe
 8085            protacc_column = "protAcc_with_ver"
 8086            mrnaacc_column = "mrnaAcc_with_ver"
 8087            refseqlink_query = f"""
 8088                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 8089                FROM {refseqlink_table} 
 8090                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 8091                WHERE protAcc_without_ver IS NOT NULL
 8092            """
 8093            # Polars Dataframe
 8094            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 8095
 8096        # Read RefSeq transcripts into a python dict/model.
 8097        log.debug(f"Transcripts loading...")
 8098        with tempfile.TemporaryDirectory() as tmpdir:
 8099            transcripts_query = f"""
 8100                COPY (
 8101                    SELECT {refseq_table}.*
 8102                    FROM {refseq_table}
 8103                    JOIN df_variants ON (
 8104                        {refseq_table}.chrom=df_variants.CHROM
 8105                        AND {refseq_table}.txStart<=df_variants.POS
 8106                        AND {refseq_table}.txEnd>=df_variants.POS
 8107                    )
 8108                )
 8109                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 8110            """
 8111            self.conn.query(transcripts_query)
 8112            with open(f"{tmpdir}/transcript.tsv") as infile:
 8113                transcripts = read_transcripts(infile)
 8114
 8115        # Polars connexion
 8116        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8117
 8118        log.debug("Genome loading...")
 8119        # Read genome sequence using pyfaidx.
 8120        genome = Fasta(genome_file)
 8121
 8122        log.debug("Start annotation HGVS...")
 8123
 8124        # Create
 8125        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 8126        ddf = dd.from_pandas(df_variants, npartitions=threads)
 8127
 8128        # Use dask.dataframe.apply() to apply function on each partition
 8129        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 8130
 8131        # Convert Dask DataFrame to Pandas Dataframe
 8132        df = ddf.compute()
 8133
 8134        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 8135        with tempfile.TemporaryDirectory() as tmpdir:
 8136            df_parquet = os.path.join(tmpdir, "df.parquet")
 8137            df.to_parquet(df_parquet)
 8138
 8139            # Update hgvs column
 8140            update_variant_query = f"""
 8141                UPDATE {table_variants}
 8142                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 8143                FROM read_parquet('{df_parquet}') as df
 8144                WHERE variants."#CHROM" = df.CHROM
 8145                AND variants.POS = df.POS
 8146                AND variants.REF = df.REF
 8147                AND variants.ALT = df.ALT
 8148                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 8149                """
 8150            self.execute_query(update_variant_query)
 8151
 8152        # Update INFO column
 8153        sql_query_update = f"""
 8154            UPDATE {table_variants}
 8155            SET INFO = 
 8156                concat(
 8157                    CASE 
 8158                        WHEN INFO NOT IN ('','.')
 8159                        THEN concat(INFO, ';')
 8160                        ELSE ''
 8161                    END,
 8162                    'hgvs=',
 8163                    {hgvs_column_name}
 8164                )
 8165            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 8166            """
 8167        self.execute_query(sql_query_update)
 8168
 8169        # Add header
 8170        HGVS_INFOS = {
 8171            "hgvs": {
 8172                "ID": "hgvs",
 8173                "Number": ".",
 8174                "Type": "String",
 8175                "Description": f"HGVS annotatation with HOWARD",
 8176            }
 8177        }
 8178
 8179        for field in HGVS_INFOS:
 8180            field_ID = HGVS_INFOS[field]["ID"]
 8181            field_description = HGVS_INFOS[field]["Description"]
 8182            self.get_header().infos[field_ID] = vcf.parser._Info(
 8183                field_ID,
 8184                HGVS_INFOS[field]["Number"],
 8185                HGVS_INFOS[field]["Type"],
 8186                field_description,
 8187                "unknown",
 8188                "unknown",
 8189                code_type_map[HGVS_INFOS[field]["Type"]],
 8190            )
 8191
 8192        # Remove added columns
 8193        for added_column in added_columns:
 8194            self.drop_column(column=added_column)
 8195
 8196    ###
 8197    # Calculation
 8198    ###
 8199
 8200    def get_operations_help(
 8201        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8202    ) -> list:
 8203
 8204        # Init
 8205        operations_help = []
 8206
 8207        # operations
 8208        operations = self.get_config_json(
 8209            name="calculations",
 8210            config_dict=operations_config_dict,
 8211            config_file=operations_config_file,
 8212        )
 8213        for op in operations:
 8214            op_name = operations[op].get("name", op).upper()
 8215            op_description = operations[op].get("description", op_name)
 8216            op_available = operations[op].get("available", False)
 8217            if op_available:
 8218                operations_help.append(f"   {op_name}: {op_description}")
 8219
 8220        # Sort operations
 8221        operations_help.sort()
 8222
 8223        # insert header
 8224        operations_help.insert(0, "Available calculation operations:")
 8225
 8226        # Return
 8227        return operations_help
 8228
 8229    def calculation(
 8230        self,
 8231        operations: dict = {},
 8232        operations_config_dict: dict = {},
 8233        operations_config_file: str = None,
 8234    ) -> None:
 8235        """
 8236        It takes a list of operations, and for each operation, it checks if it's a python or sql
 8237        operation, and then calls the appropriate function
 8238
 8239        param json example:
 8240            "calculation": {
 8241                "NOMEN": {
 8242                    "options": {
 8243                        "hgvs_field": "hgvs"
 8244                    },
 8245                "middle" : null
 8246            }
 8247        """
 8248
 8249        # Param
 8250        param = self.get_param()
 8251
 8252        # CHeck operations config file
 8253        if operations_config_file is None:
 8254            operations_config_file = param.get("calculation", {}).get(
 8255                "calculation_config", None
 8256            )
 8257
 8258        # operations config
 8259        operations_config = self.get_config_json(
 8260            name="calculations",
 8261            config_dict=operations_config_dict,
 8262            config_file=operations_config_file,
 8263        )
 8264
 8265        # Upper keys
 8266        operations_config = {k.upper(): v for k, v in operations_config.items()}
 8267
 8268        # Calculations
 8269
 8270        # Operations from param
 8271        operations = param.get("calculation", {}).get("calculations", operations)
 8272
 8273        # Quick calculation - add
 8274        if param.get("calculations", None):
 8275
 8276            # List of operations
 8277            calculations_list = [
 8278                value.strip() for value in param.get("calculations", "").split(",")
 8279            ]
 8280
 8281            # Log
 8282            log.info(f"Quick Calculations:")
 8283            for calculation_key in calculations_list:
 8284                log.info(f"   {calculation_key}")
 8285
 8286            # Create tmp operations (to keep operation order)
 8287            operations_tmp = {}
 8288            for calculation_operation in calculations_list:
 8289                if calculation_operation.upper() not in operations_tmp:
 8290                    log.debug(
 8291                        f"{calculation_operation}.upper() not in {operations_tmp}"
 8292                    )
 8293                    operations_tmp[calculation_operation.upper()] = {}
 8294                    add_value_into_dict(
 8295                        dict_tree=operations_tmp,
 8296                        sections=[
 8297                            calculation_operation.upper(),
 8298                        ],
 8299                        value=operations.get(calculation_operation.upper(), {}),
 8300                    )
 8301            # Add operations already in param
 8302            for calculation_operation in operations:
 8303                if calculation_operation not in operations_tmp:
 8304                    operations_tmp[calculation_operation] = operations.get(
 8305                        calculation_operation, {}
 8306                    )
 8307
 8308            # Update operations in param
 8309            operations = operations_tmp
 8310
 8311        # Operations for calculation
 8312        if not operations:
 8313            operations = param.get("calculation", {}).get("calculations", {})
 8314
 8315        if operations:
 8316            log.info(f"Calculations...")
 8317
 8318        # For each operations
 8319        for operation_name in operations:
 8320            operation_name = operation_name.upper()
 8321            if operation_name not in [""]:
 8322                if operation_name in operations_config:
 8323                    log.info(f"Calculation '{operation_name}'")
 8324                    operation = operations_config[operation_name]
 8325                    operation_type = operation.get("type", "sql")
 8326                    if operation_type == "python":
 8327                        self.calculation_process_function(
 8328                            operation=operation, operation_name=operation_name
 8329                        )
 8330                    elif operation_type == "sql":
 8331                        self.calculation_process_sql(
 8332                            operation=operation, operation_name=operation_name
 8333                        )
 8334                    else:
 8335                        log.error(
 8336                            f"Operations config: Type '{operation_type}' NOT available"
 8337                        )
 8338                        raise ValueError(
 8339                            f"Operations config: Type '{operation_type}' NOT available"
 8340                        )
 8341                else:
 8342                    log.error(
 8343                        f"Operations config: Calculation '{operation_name}' NOT available"
 8344                    )
 8345                    raise ValueError(
 8346                        f"Operations config: Calculation '{operation_name}' NOT available"
 8347                    )
 8348
 8349        # Explode INFOS fields into table fields
 8350        if self.get_explode_infos():
 8351            self.explode_infos(
 8352                prefix=self.get_explode_infos_prefix(),
 8353                fields=self.get_explode_infos_fields(),
 8354                force=True,
 8355            )
 8356
 8357    def calculation_process_sql(
 8358        self, operation: dict, operation_name: str = "unknown"
 8359    ) -> None:
 8360        """
 8361        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8362        performs the operation, updating the specified table with the result.
 8363
 8364        :param operation: The `operation` parameter is a dictionary that contains information about the
 8365        mathematical operation to be performed. It includes the following keys:
 8366        :type operation: dict
 8367        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8368        the mathematical operation being performed. It is used for logging and error handling purposes,
 8369        defaults to unknown
 8370        :type operation_name: str (optional)
 8371        """
 8372
 8373        # Operation infos
 8374        operation_name = operation.get("name", "unknown")
 8375        log.debug(f"process SQL {operation_name}")
 8376        output_column_name = operation.get("output_column_name", operation_name)
 8377        output_column_type = operation.get("output_column_type", "String")
 8378        prefix = operation.get("explode_infos_prefix", "")
 8379        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8380        output_column_description = operation.get(
 8381            "output_column_description", f"{operation_name} operation"
 8382        )
 8383        operation_query = operation.get("operation_query", None)
 8384        if isinstance(operation_query, list):
 8385            operation_query = " ".join(operation_query)
 8386        operation_info_fields = operation.get("info_fields", [])
 8387        operation_info_fields_check = operation.get("info_fields_check", False)
 8388        operation_info = operation.get("operation_info", True)
 8389        operation_table = operation.get(
 8390            "table", self.get_table_variants(clause="alter")
 8391        )
 8392
 8393        # table variants
 8394        if operation_table:
 8395            table_variants = operation_table
 8396        else:
 8397            table_variants = self.get_table_variants(clause="alter")
 8398
 8399        if operation_query:
 8400
 8401            # Info fields check
 8402            operation_info_fields_check_result = True
 8403            if operation_info_fields_check:
 8404                header_infos = self.get_header().infos
 8405                for info_field in operation_info_fields:
 8406                    operation_info_fields_check_result = (
 8407                        operation_info_fields_check_result
 8408                        and info_field in header_infos
 8409                    )
 8410
 8411            # If info fields available
 8412            if operation_info_fields_check_result:
 8413
 8414                # Added_columns
 8415                added_columns = []
 8416
 8417                # Create VCF header field
 8418                vcf_reader = self.get_header()
 8419                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8420                    output_column_name,
 8421                    ".",
 8422                    output_column_type,
 8423                    output_column_description,
 8424                    "howard calculation",
 8425                    "0",
 8426                    self.code_type_map.get(output_column_type),
 8427                )
 8428
 8429                # Explode infos if needed
 8430                log.debug(f"calculation_process_sql prefix {prefix}")
 8431                added_columns += self.explode_infos(
 8432                    prefix=prefix,
 8433                    fields=[output_column_name] + operation_info_fields,
 8434                    force=False,
 8435                    table=table_variants,
 8436                )
 8437
 8438                # Create column
 8439                added_column = self.add_column(
 8440                    table_name=table_variants,
 8441                    column_name=prefix + output_column_name,
 8442                    column_type=output_column_type_sql,
 8443                    default_value="null",
 8444                )
 8445                added_columns.append(added_column)
 8446
 8447                # Operation calculation
 8448                try:
 8449
 8450                    # Query to update calculation column
 8451                    sql_update = f"""
 8452                        UPDATE {table_variants}
 8453                        SET "{prefix}{output_column_name}" = ({operation_query})
 8454                    """
 8455                    self.conn.execute(sql_update)
 8456
 8457                    # Add to INFO
 8458                    if operation_info:
 8459                        sql_update_info = f"""
 8460                            UPDATE {table_variants}
 8461                            SET "INFO" =
 8462                                concat(
 8463                                    CASE
 8464                                        WHEN "INFO" IS NOT NULL
 8465                                        THEN concat("INFO", ';')
 8466                                        ELSE ''
 8467                                    END,
 8468                                    '{output_column_name}=',
 8469                                    "{prefix}{output_column_name}"
 8470                                )
 8471                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8472                        """
 8473                        self.conn.execute(sql_update_info)
 8474
 8475                except:
 8476                    log.error(
 8477                        f"Operations config: Calculation '{operation_name}' query failed"
 8478                    )
 8479                    raise ValueError(
 8480                        f"Operations config: Calculation '{operation_name}' query failed"
 8481                    )
 8482
 8483                # Remove added columns
 8484                for added_column in added_columns:
 8485                    log.debug(f"added_column: {added_column}")
 8486                    self.drop_column(column=added_column)
 8487
 8488            else:
 8489                log.error(
 8490                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8491                )
 8492                raise ValueError(
 8493                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8494                )
 8495
 8496        else:
 8497            log.error(
 8498                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8499            )
 8500            raise ValueError(
 8501                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8502            )
 8503
 8504    def calculation_process_function(
 8505        self, operation: dict, operation_name: str = "unknown"
 8506    ) -> None:
 8507        """
 8508        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8509        function with the given parameters.
 8510
 8511        :param operation: The `operation` parameter is a dictionary that contains information about the
 8512        operation to be performed. It has the following keys:
 8513        :type operation: dict
 8514        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8515        the operation being performed. It is used for logging purposes, defaults to unknown
 8516        :type operation_name: str (optional)
 8517        """
 8518
 8519        operation_name = operation["name"]
 8520        log.debug(f"process Python {operation_name}")
 8521        function_name = operation["function_name"]
 8522        function_params = operation["function_params"]
 8523        getattr(self, function_name)(*function_params)
 8524
 8525    def calculation_variant_id(self) -> None:
 8526        """
 8527        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8528        updates the INFO field of a variants table with the variant ID.
 8529        """
 8530
 8531        # variant_id annotation field
 8532        variant_id_tag = self.get_variant_id_column()
 8533        added_columns = [variant_id_tag]
 8534
 8535        # variant_id hgvs tags"
 8536        vcf_infos_tags = {
 8537            variant_id_tag: "howard variant ID annotation",
 8538        }
 8539
 8540        # Variants table
 8541        table_variants = self.get_table_variants()
 8542
 8543        # Header
 8544        vcf_reader = self.get_header()
 8545
 8546        # Add variant_id to header
 8547        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8548            variant_id_tag,
 8549            ".",
 8550            "String",
 8551            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8552            "howard calculation",
 8553            "0",
 8554            self.code_type_map.get("String"),
 8555        )
 8556
 8557        # Update
 8558        sql_update = f"""
 8559            UPDATE {table_variants}
 8560            SET "INFO" = 
 8561                concat(
 8562                    CASE
 8563                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8564                        THEN ''
 8565                        ELSE concat("INFO", ';')
 8566                    END,
 8567                    '{variant_id_tag}=',
 8568                    "{variant_id_tag}"
 8569                )
 8570        """
 8571        self.conn.execute(sql_update)
 8572
 8573        # Remove added columns
 8574        for added_column in added_columns:
 8575            self.drop_column(column=added_column)
 8576
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
        function is used to specify the name of the column that will store the HGVS nomenclatures
        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
        function represents the field in the VCF file that contains SnpEff annotations. This field is
        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
        to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff ANN header description cannot be parsed
        """

        # Snpeff hgvs tags: description used for the new VCF header field
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any truthy configured prefix is replaced by "INFO/" —
        # the configured value itself is discarded; confirm this is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff exploded column names (prefix + field name)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added temporarily, dropped at the end
        added_columns = []

        # Explode HGVS field in column (makes the ANN field queryable as a column)
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: snpEff lists its sub-fields in the INFO
            # description as a single-quoted, " | "-separated string
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Normalized key: keep alphanumeric characters only
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (temporary join key column)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant id + exploded ANN values into a dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Extract the HGVS nomenclatures from each ANN value using the
            # parsed header field order
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<snpeff_hgvs>=<value>' to INFO, joining rows on the
            # variant-id column; empty/'.'/'NaN' extractions are skipped.
            # NOTE(review): the UPDATE targets the hardcoded 'variants' table
            # while the WHERE clause uses {table_variants} — confirm these
            # always refer to the same table
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to free memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8713
 8714    def calculation_snpeff_ann_explode(
 8715        self,
 8716        uniquify: bool = True,
 8717        output_format: str = "fields",
 8718        output_prefix: str = "snpeff_",
 8719        snpeff_field: str = "ANN",
 8720    ) -> None:
 8721        """
 8722        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8723        exploding the HGVS field and updating variant information accordingly.
 8724
 8725        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8726        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8727        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8728        defaults to True
 8729        :type uniquify: bool (optional)
 8730        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8731        function specifies the format in which the output annotations will be generated. It has a
 8732        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8733        format, defaults to fields
 8734        :type output_format: str (optional)
 8735        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8736        method is used to specify the prefix that will be added to the output annotations generated
 8737        during the calculation process. This prefix helps to differentiate the newly added annotations
 8738        from existing ones in the output data. By default, the, defaults to ANN_
 8739        :type output_prefix: str (optional)
 8740        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8741        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8742        field will be processed to explode the HGVS annotations and update the variant information
 8743        accordingly, defaults to ANN
 8744        :type snpeff_field: str (optional)
 8745        """
 8746
 8747        # SnpEff annotation field
 8748        snpeff_hgvs = "snpeff_ann_explode"
 8749
 8750        # Snpeff hgvs tags
 8751        vcf_infos_tags = {
 8752            snpeff_hgvs: "Explode snpEff annotations",
 8753        }
 8754
 8755        # Prefix
 8756        prefix = self.get_explode_infos_prefix()
 8757        if prefix:
 8758            prefix = "INFO/"
 8759
 8760        # snpEff fields
 8761        speff_ann_infos = prefix + snpeff_field
 8762        speff_hgvs_infos = prefix + snpeff_hgvs
 8763
 8764        # Variants table
 8765        table_variants = self.get_table_variants()
 8766
 8767        # Header
 8768        vcf_reader = self.get_header()
 8769
 8770        # Add columns
 8771        added_columns = []
 8772
 8773        # Explode HGVS field in column
 8774        added_columns += self.explode_infos(fields=[snpeff_field])
 8775        log.debug(f"snpeff_field={snpeff_field}")
 8776        log.debug(f"added_columns={added_columns}")
 8777
 8778        if snpeff_field in vcf_reader.infos:
 8779
 8780            # Extract ANN header
 8781            ann_description = vcf_reader.infos[snpeff_field].desc
 8782            pattern = r"'(.+?)'"
 8783            match = re.search(pattern, ann_description)
 8784            if match:
 8785                ann_header_match = match.group(1).split(" | ")
 8786                ann_header = []
 8787                ann_header_desc = {}
 8788                for i in range(len(ann_header_match)):
 8789                    ann_header_info = "".join(
 8790                        char for char in ann_header_match[i] if char.isalnum()
 8791                    )
 8792                    ann_header.append(ann_header_info)
 8793                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8794                if not ann_header_desc:
 8795                    raise ValueError("Invalid header description format")
 8796            else:
 8797                raise ValueError("Invalid header description format")
 8798
 8799            # Create variant id
 8800            variant_id_column = self.get_variant_id_column()
 8801            added_columns += [variant_id_column]
 8802
 8803            # Create dataframe
 8804            dataframe_snpeff_hgvs = self.get_query_to_df(
 8805                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8806            )
 8807
 8808            # Create snpEff columns
 8809            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8810                speff_ann_infos
 8811            ].apply(
 8812                lambda x: explode_snpeff_ann(
 8813                    str(x),
 8814                    uniquify=uniquify,
 8815                    output_format=output_format,
 8816                    prefix=output_prefix,
 8817                    header=list(ann_header_desc.values()),
 8818                )
 8819            )
 8820
 8821            # Header
 8822            ann_annotations_prefix = ""
 8823            if output_format.upper() in ["JSON"]:
 8824                ann_annotations_prefix = f"{output_prefix}="
 8825                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8826                    output_prefix,
 8827                    ".",
 8828                    "String",
 8829                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8830                    + " - JSON format",
 8831                    "howard calculation",
 8832                    "0",
 8833                    self.code_type_map.get("String"),
 8834                )
 8835            else:
 8836                for ann_annotation in ann_header:
 8837                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8838                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8839                        ann_annotation_id,
 8840                        ".",
 8841                        "String",
 8842                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8843                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8844                        "howard calculation",
 8845                        "0",
 8846                        self.code_type_map.get("String"),
 8847                    )
 8848
 8849            # Update
 8850            sql_update = f"""
 8851                UPDATE variants
 8852                SET "INFO" = 
 8853                    concat(
 8854                        CASE
 8855                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8856                            THEN ''
 8857                            ELSE concat("INFO", ';')
 8858                        END,
 8859                        CASE 
 8860                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8861                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8862                            THEN concat(
 8863                                '{ann_annotations_prefix}',
 8864                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8865                                )
 8866                            ELSE ''
 8867                        END
 8868                    )
 8869                FROM dataframe_snpeff_hgvs
 8870                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8871
 8872            """
 8873            self.conn.execute(sql_update)
 8874
 8875            # Delete dataframe
 8876            del dataframe_snpeff_hgvs
 8877            gc.collect()
 8878
 8879        else:
 8880
 8881            log.warning(
 8882                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8883            )
 8884
 8885        # Remove added columns
 8886        for added_column in added_columns:
 8887            self.drop_column(column=added_column)
 8888
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        Options are read from param["calculation"]["calculations"]["NOMEN"]["options"]:
        'hgvs_field' (INFO field containing HGVS annotations, default 'hgvs'),
        'pattern' (NOMEN construction pattern passed to `find_nomen`),
        'transcripts' (file whose first column lists preferred transcripts, in order),
        'transcripts_table'/'transcripts_column' (per-variant preferred transcript),
        and 'transcripts_order' (source precedence, default ["column", "file"]).

        For each variant, `find_nomen` builds a dict of NOMEN sub-fields which are
        appended to the INFO column via a DuckDB UPDATE and declared in the VCF header.

        :raises ValueError: if a transcripts file is configured but does not exist
        """

        # Name of the temporary dataframe column holding the per-variant NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: each key becomes an INFO field, value is its header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix used for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way; dropped at the end
        added_columns = []

        # Get HGVS field
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources (keys e.g. "file")
        transcripts_sources = {}

        # Get transcripts file
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                # First column of the file holds the transcript IDs (preference order)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table (defaults to the variants table)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        # Per-variant preferred transcript, if a table/column is configured
        if transcripts_table and transcripts_column:
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos: only proceed if the exploded HGVS column actually exists
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe (its variable name is referenced by the SQL UPDATE
            # below through DuckDB's dataframe replacement scan)
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Create main NOMEN column: one dict of NOMEN sub-fields per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Appends ';FIELD=value' only when the computed value is non-empty
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update: the fragments become arguments of a single concat()
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO, joining back on chromosome/position/ref/alt
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 9088    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9089        """
 9090        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9091        pipeline/sample for a variant and updates the variant information in a VCF file.
 9092
 9093        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9094        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9095        VCF header and to update the corresponding field in the variants table, defaults to
 9096        findbypipeline
 9097        :type tag: str (optional)
 9098        """
 9099
 9100        # if FORMAT and samples
 9101        if (
 9102            "FORMAT" in self.get_header_columns_as_list()
 9103            and self.get_header_sample_list()
 9104        ):
 9105
 9106            # findbypipeline annotation field
 9107            findbypipeline_tag = tag
 9108
 9109            # VCF infos tags
 9110            vcf_infos_tags = {
 9111                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9112            }
 9113
 9114            # Prefix
 9115            prefix = self.get_explode_infos_prefix()
 9116
 9117            # Field
 9118            findbypipeline_infos = prefix + findbypipeline_tag
 9119
 9120            # Variants table
 9121            table_variants = self.get_table_variants()
 9122
 9123            # Header
 9124            vcf_reader = self.get_header()
 9125
 9126            # Create variant id
 9127            variant_id_column = self.get_variant_id_column()
 9128            added_columns = [variant_id_column]
 9129
 9130            # variant_id, FORMAT and samples
 9131            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9132                self.get_header_sample_list()
 9133            )
 9134
 9135            # Create dataframe
 9136            dataframe_findbypipeline = self.get_query_to_df(
 9137                f""" SELECT {samples_fields} FROM {table_variants} """
 9138            )
 9139
 9140            # Create findbypipeline column
 9141            dataframe_findbypipeline[findbypipeline_infos] = (
 9142                dataframe_findbypipeline.apply(
 9143                    lambda row: findbypipeline(
 9144                        row, samples=self.get_header_sample_list()
 9145                    ),
 9146                    axis=1,
 9147                )
 9148            )
 9149
 9150            # Add snpeff_hgvs to header
 9151            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9152                findbypipeline_tag,
 9153                ".",
 9154                "String",
 9155                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9156                "howard calculation",
 9157                "0",
 9158                self.code_type_map.get("String"),
 9159            )
 9160
 9161            # Update
 9162            sql_update = f"""
 9163                UPDATE variants
 9164                SET "INFO" = 
 9165                    concat(
 9166                        CASE
 9167                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9168                            THEN ''
 9169                            ELSE concat("INFO", ';')
 9170                        END,
 9171                        CASE 
 9172                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9173                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9174                            THEN concat(
 9175                                    '{findbypipeline_tag}=',
 9176                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9177                                )
 9178                            ELSE ''
 9179                        END
 9180                    )
 9181                FROM dataframe_findbypipeline
 9182                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9183            """
 9184            self.conn.execute(sql_update)
 9185
 9186            # Remove added columns
 9187            for added_column in added_columns:
 9188                self.drop_column(column=added_column)
 9189
 9190            # Delete dataframe
 9191            del dataframe_findbypipeline
 9192            gc.collect()
 9193
 9194    def calculation_genotype_concordance(self) -> None:
 9195        """
 9196        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9197        multi-caller VCF files and updates the variant information in the database.
 9198        """
 9199
 9200        # if FORMAT and samples
 9201        if (
 9202            "FORMAT" in self.get_header_columns_as_list()
 9203            and self.get_header_sample_list()
 9204        ):
 9205
 9206            # genotypeconcordance annotation field
 9207            genotypeconcordance_tag = "genotypeconcordance"
 9208
 9209            # VCF infos tags
 9210            vcf_infos_tags = {
 9211                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9212            }
 9213
 9214            # Prefix
 9215            prefix = self.get_explode_infos_prefix()
 9216
 9217            # Field
 9218            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9219
 9220            # Variants table
 9221            table_variants = self.get_table_variants()
 9222
 9223            # Header
 9224            vcf_reader = self.get_header()
 9225
 9226            # Create variant id
 9227            variant_id_column = self.get_variant_id_column()
 9228            added_columns = [variant_id_column]
 9229
 9230            # variant_id, FORMAT and samples
 9231            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9232                self.get_header_sample_list()
 9233            )
 9234
 9235            # Create dataframe
 9236            dataframe_genotypeconcordance = self.get_query_to_df(
 9237                f""" SELECT {samples_fields} FROM {table_variants} """
 9238            )
 9239
 9240            # Create genotypeconcordance column
 9241            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9242                dataframe_genotypeconcordance.apply(
 9243                    lambda row: genotypeconcordance(
 9244                        row, samples=self.get_header_sample_list()
 9245                    ),
 9246                    axis=1,
 9247                )
 9248            )
 9249
 9250            # Add genotypeconcordance to header
 9251            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9252                genotypeconcordance_tag,
 9253                ".",
 9254                "String",
 9255                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9256                "howard calculation",
 9257                "0",
 9258                self.code_type_map.get("String"),
 9259            )
 9260
 9261            # Update
 9262            sql_update = f"""
 9263                UPDATE variants
 9264                SET "INFO" = 
 9265                    concat(
 9266                        CASE
 9267                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9268                            THEN ''
 9269                            ELSE concat("INFO", ';')
 9270                        END,
 9271                        CASE
 9272                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9273                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9274                            THEN concat(
 9275                                    '{genotypeconcordance_tag}=',
 9276                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9277                                )
 9278                            ELSE ''
 9279                        END
 9280                    )
 9281                FROM dataframe_genotypeconcordance
 9282                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9283            """
 9284            self.conn.execute(sql_update)
 9285
 9286            # Remove added columns
 9287            for added_column in added_columns:
 9288                self.drop_column(column=added_column)
 9289
 9290            # Delete dataframe
 9291            del dataframe_genotypeconcordance
 9292            gc.collect()
 9293
 9294    def calculation_barcode(self, tag: str = "barcode") -> None:
 9295        """
 9296        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9297        updates the INFO field in the file with the calculated barcode values.
 9298
 9299        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9300        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9301        the default tag name is set to "barcode", defaults to barcode
 9302        :type tag: str (optional)
 9303        """
 9304
 9305        # if FORMAT and samples
 9306        if (
 9307            "FORMAT" in self.get_header_columns_as_list()
 9308            and self.get_header_sample_list()
 9309        ):
 9310
 9311            # barcode annotation field
 9312            if not tag:
 9313                tag = "barcode"
 9314
 9315            # VCF infos tags
 9316            vcf_infos_tags = {
 9317                tag: "barcode calculation (VaRank)",
 9318            }
 9319
 9320            # Prefix
 9321            prefix = self.get_explode_infos_prefix()
 9322
 9323            # Field
 9324            barcode_infos = prefix + tag
 9325
 9326            # Variants table
 9327            table_variants = self.get_table_variants()
 9328
 9329            # Header
 9330            vcf_reader = self.get_header()
 9331
 9332            # Create variant id
 9333            variant_id_column = self.get_variant_id_column()
 9334            added_columns = [variant_id_column]
 9335
 9336            # variant_id, FORMAT and samples
 9337            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9338                self.get_header_sample_list()
 9339            )
 9340
 9341            # Create dataframe
 9342            dataframe_barcode = self.get_query_to_df(
 9343                f""" SELECT {samples_fields} FROM {table_variants} """
 9344            )
 9345
 9346            # Create barcode column
 9347            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9348                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9349            )
 9350
 9351            # Add barcode to header
 9352            vcf_reader.infos[tag] = vcf.parser._Info(
 9353                tag,
 9354                ".",
 9355                "String",
 9356                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9357                "howard calculation",
 9358                "0",
 9359                self.code_type_map.get("String"),
 9360            )
 9361
 9362            # Update
 9363            sql_update = f"""
 9364                UPDATE {table_variants}
 9365                SET "INFO" = 
 9366                    concat(
 9367                        CASE
 9368                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9369                            THEN ''
 9370                            ELSE concat("INFO", ';')
 9371                        END,
 9372                        CASE
 9373                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9374                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9375                            THEN concat(
 9376                                    '{tag}=',
 9377                                    dataframe_barcode."{barcode_infos}"
 9378                                )
 9379                            ELSE ''
 9380                        END
 9381                    )
 9382                FROM dataframe_barcode
 9383                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9384            """
 9385            self.conn.execute(sql_update)
 9386
 9387            # Remove added columns
 9388            for added_column in added_columns:
 9389                self.drop_column(column=added_column)
 9390
 9391            # Delete dataframe
 9392            del dataframe_barcode
 9393            gc.collect()
 9394
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates family barcode values for variants
        in a VCF file and appends them to the FORMAT and sample columns.

        The family pedigree is read from
        param["calculation"]["calculations"]["BARCODEFAMILY"]["family_pedigree"], which may be
        a YAML/JSON file path, a JSON string, a comma-separated list of sample names, or a
        dict (member -> sample); when absent, all samples of the VCF are used. Two FORMAT
        fields are declared in the header and appended to every genotype: '<tag>' (the
        family barcode) and '<tag>S' (the samples used to compute it).

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or resolves to no samples
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fallback if explicitly passed as empty)
            if not tag:
                tag = "BCF"

            # VCF infos tags: descriptions for the two generated FORMAT fields
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param: pedigree describing the family members
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (parsed as YAML, which also accepts JSON)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = yaml.safe_load(ped)

                # Pedigree is a string: try JSON first, otherwise treat it as a
                # comma-separated list of sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: build a member->sample mapping from 'a,b,c'
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: used as-is (member -> sample name)
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of sample names involved in the barcode
                ped_samples = list(ped.values())

            else:
                # No pedigree provided: use every sample of the VCF
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (used to join the dataframe back to the table)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe (its variable name is referenced by the SQL UPDATE
            # below through DuckDB's dataframe replacement scan)
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column from the pedigree samples' genotypes
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add the barcode family FORMAT fields ('<tag>' and '<tag>S') to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append ':<value>:<value_samples>' to FORMAT and every sample column
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    # Pedigree sample: computed barcode and list of pedigree samples
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    # FORMAT column: the two new field names
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    # Sample outside the pedigree: missing values
                    value = "'.'"
                    value_samples = "'.'"
                # For a './.' genotype, a full missing-value placeholder is derived
                # from FORMAT: removing alphanumerics/whitespace keeps only the ':'
                # separators, each then expanded to ':.' (one '.' per FORMAT field)
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
 9585    def calculation_trio(self) -> None:
 9586        """
 9587        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9588        information to the INFO field of each variant.
 9589        """
 9590
 9591        # if FORMAT and samples
 9592        if (
 9593            "FORMAT" in self.get_header_columns_as_list()
 9594            and self.get_header_sample_list()
 9595        ):
 9596
 9597            # trio annotation field
 9598            trio_tag = "trio"
 9599
 9600            # VCF infos tags
 9601            vcf_infos_tags = {
 9602                "trio": "trio calculation",
 9603            }
 9604
 9605            # Param
 9606            param = self.get_param()
 9607
 9608            # Prefix
 9609            prefix = self.get_explode_infos_prefix()
 9610
 9611            # Trio param
 9612            trio_ped = (
 9613                param.get("calculation", {})
 9614                .get("calculations", {})
 9615                .get("TRIO", {})
 9616                .get("trio_pedigree", None)
 9617            )
 9618
 9619            # Load trio
 9620            if trio_ped:
 9621
 9622                # Trio pedigree is a file
 9623                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9624                    log.debug("TRIO pedigree is file")
 9625                    with open(full_path(trio_ped)) as trio_ped:
 9626                        trio_ped = yaml.safe_load(trio_ped)
 9627
 9628                # Trio pedigree is a string
 9629                elif isinstance(trio_ped, str):
 9630                    log.debug("TRIO pedigree is str")
 9631                    try:
 9632                        trio_ped = json.loads(trio_ped)
 9633                        log.debug("TRIO pedigree is json str")
 9634                    except ValueError as e:
 9635                        trio_samples = trio_ped.split(",")
 9636                        if len(trio_samples) == 3:
 9637                            trio_ped = {
 9638                                "father": trio_samples[0],
 9639                                "mother": trio_samples[1],
 9640                                "child": trio_samples[2],
 9641                            }
 9642                            log.debug("TRIO pedigree is list str")
 9643                        else:
 9644                            msg_error = "TRIO pedigree not well formatted"
 9645                            log.error(msg_error)
 9646                            raise ValueError(msg_error)
 9647
 9648                # Trio pedigree is a dict
 9649                elif isinstance(trio_ped, dict):
 9650                    log.debug("TRIO pedigree is dict")
 9651
 9652                # Trio pedigree is not well formatted
 9653                else:
 9654                    msg_error = "TRIO pedigree not well formatted"
 9655                    log.error(msg_error)
 9656                    raise ValueError(msg_error)
 9657
 9658                # Construct trio list
 9659                trio_samples = [
 9660                    trio_ped.get("father", ""),
 9661                    trio_ped.get("mother", ""),
 9662                    trio_ped.get("child", ""),
 9663                ]
 9664
 9665            else:
 9666                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9667                samples_list = self.get_header_sample_list()
 9668                if len(samples_list) >= 3:
 9669                    trio_samples = self.get_header_sample_list()[0:3]
 9670                    trio_ped = {
 9671                        "father": trio_samples[0],
 9672                        "mother": trio_samples[1],
 9673                        "child": trio_samples[2],
 9674                    }
 9675                else:
 9676                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9677                    log.error(msg_error)
 9678                    raise ValueError(msg_error)
 9679
 9680            # Check trio pedigree
 9681            if not trio_ped or len(trio_ped) != 3:
 9682                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9683                log.error(msg_error)
 9684                raise ValueError(msg_error)
 9685
 9686            # Log
 9687            log.info(
 9688                f"Calculation 'TRIO' - Samples: "
 9689                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9690            )
 9691
 9692            # Field
 9693            trio_infos = prefix + trio_tag
 9694
 9695            # Variants table
 9696            table_variants = self.get_table_variants()
 9697
 9698            # Header
 9699            vcf_reader = self.get_header()
 9700
 9701            # Create variant id
 9702            variant_id_column = self.get_variant_id_column()
 9703            added_columns = [variant_id_column]
 9704
 9705            # variant_id, FORMAT and samples
 9706            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9707                self.get_header_sample_list()
 9708            )
 9709
 9710            # Create dataframe
 9711            dataframe_trio = self.get_query_to_df(
 9712                f""" SELECT {samples_fields} FROM {table_variants} """
 9713            )
 9714
 9715            # Create trio column
 9716            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9717                lambda row: trio(row, samples=trio_samples), axis=1
 9718            )
 9719
 9720            # Add trio to header
 9721            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9722                trio_tag,
 9723                ".",
 9724                "String",
 9725                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9726                "howard calculation",
 9727                "0",
 9728                self.code_type_map.get("String"),
 9729            )
 9730
 9731            # Update
 9732            sql_update = f"""
 9733                UPDATE {table_variants}
 9734                SET "INFO" = 
 9735                    concat(
 9736                        CASE
 9737                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9738                            THEN ''
 9739                            ELSE concat("INFO", ';')
 9740                        END,
 9741                        CASE
 9742                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9743                             AND dataframe_trio."{trio_infos}" NOT NULL
 9744                            THEN concat(
 9745                                    '{trio_tag}=',
 9746                                    dataframe_trio."{trio_infos}"
 9747                                )
 9748                            ELSE ''
 9749                        END
 9750                    )
 9751                FROM dataframe_trio
 9752                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9753            """
 9754            self.conn.execute(sql_update)
 9755
 9756            # Remove added columns
 9757            for added_column in added_columns:
 9758                self.drop_column(column=added_column)
 9759
 9760            # Delete dataframe
 9761            del dataframe_trio
 9762            gc.collect()
 9763
 9764    def calculation_vaf_normalization(self) -> None:
 9765        """
 9766        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9767        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9768        :return: The function does not return anything.
 9769        """
 9770
 9771        # if FORMAT and samples
 9772        if (
 9773            "FORMAT" in self.get_header_columns_as_list()
 9774            and self.get_header_sample_list()
 9775        ):
 9776
 9777            # vaf_normalization annotation field
 9778            vaf_normalization_tag = "VAF"
 9779
 9780            # VCF infos tags
 9781            vcf_infos_tags = {
 9782                "VAF": "VAF Variant Frequency",
 9783            }
 9784
 9785            # Prefix
 9786            prefix = self.get_explode_infos_prefix()
 9787
 9788            # Variants table
 9789            table_variants = self.get_table_variants()
 9790
 9791            # Header
 9792            vcf_reader = self.get_header()
 9793
 9794            # Do not calculate if VAF already exists
 9795            if "VAF" in vcf_reader.formats:
 9796                log.debug("VAF already on genotypes")
 9797                return
 9798
 9799            # Create variant id
 9800            variant_id_column = self.get_variant_id_column()
 9801            added_columns = [variant_id_column]
 9802
 9803            # variant_id, FORMAT and samples
 9804            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9805                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9806            )
 9807
 9808            # Create dataframe
 9809            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9810            log.debug(f"query={query}")
 9811            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9812
 9813            vaf_normalization_set = []
 9814
 9815            # for each sample vaf_normalization
 9816            for sample in self.get_header_sample_list():
 9817                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9818                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9819                )
 9820                vaf_normalization_set.append(
 9821                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9822                )
 9823
 9824            # Add VAF to FORMAT
 9825            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9826                "FORMAT"
 9827            ].apply(lambda x: str(x) + ":VAF")
 9828            vaf_normalization_set.append(
 9829                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9830            )
 9831
 9832            # Add vaf_normalization to header
 9833            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9834                id=vaf_normalization_tag,
 9835                num="1",
 9836                type="Float",
 9837                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9838                type_code=self.code_type_map.get("Float"),
 9839            )
 9840
 9841            # Create fields to add in INFO
 9842            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9843
 9844            # Update
 9845            sql_update = f"""
 9846                UPDATE {table_variants}
 9847                SET {sql_vaf_normalization_set}
 9848                FROM dataframe_vaf_normalization
 9849                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9850
 9851            """
 9852            self.conn.execute(sql_update)
 9853
 9854            # Remove added columns
 9855            for added_column in added_columns:
 9856                self.drop_column(column=added_column)
 9857
 9858            # Delete dataframe
 9859            del dataframe_vaf_normalization
 9860            gc.collect()
 9861
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Statistics (`<info>_stats_nb`, `_list`, `_min`, `_max`, `_mean`, `_mediane`,
        `_stdev`) are computed per variant across all samples, declared in the VCF
        header, and appended to the INFO column.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable if the VCF has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO tag per computed statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field holding the raw per-variant statistics dict in the dataframe
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: genotype_stats returns a dict of statistics
            # keyed like vcf_infos_tags
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic into its own dataframe column
                # (the lambda is applied immediately within this iteration, so
                # capturing the loop variable `stat` is safe here)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic field in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: none before the first field, ';' before the others
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                # NOTE(review): if an earlier stat is NULL, the following field still
                # emits its leading ';', which could yield ';;' in INFO — confirm this
                # is acceptable downstream
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update: append the computed statistics to INFO, joining on variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
 9999
10000    def calculation_transcripts_annotation(
10001        self, info_json: str = None, info_format: str = None
10002    ) -> None:
10003        """
10004        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10005        field to it if transcripts are available.
10006
10007        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10008        is a string parameter that represents the information field to be used in the transcripts JSON.
10009        It is used to specify the JSON format for the transcripts information. If no value is provided
10010        when calling the method, it defaults to "
10011        :type info_json: str
10012        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10013        method is a string parameter that specifies the format of the information field to be used in
10014        the transcripts JSON. It is used to define the format of the information field
10015        :type info_format: str
10016        """
10017
10018        # Create transcripts table
10019        transcripts_table = self.create_transcript_view()
10020
10021        # Add info field
10022        if transcripts_table:
10023            self.transcript_view_to_variants(
10024                transcripts_table=transcripts_table,
10025                transcripts_info_field_json=info_json,
10026                transcripts_info_field_format=info_format,
10027            )
10028        else:
10029            log.info("No Transcripts to process. Check param.json file configuration")
10030
10031    def calculation_transcripts_prioritization(self) -> None:
10032        """
10033        The function `calculation_transcripts_prioritization` creates a transcripts table and
10034        prioritizes transcripts based on certain criteria.
10035        """
10036
10037        # Create transcripts table
10038        transcripts_table = self.create_transcript_view()
10039
10040        # Add info field
10041        if transcripts_table:
10042            self.transcripts_prioritization(transcripts_table=transcripts_table)
10043        else:
10044            log.info("No Transcripts to process. Check param.json file configuration")
10045
10046    def calculation_transcripts_export(self) -> None:
10047        """ """
10048
10049        # Create transcripts table
10050        transcripts_table = self.create_transcript_view()
10051
10052        # Add info field
10053        if transcripts_table:
10054            self.transcripts_export(transcripts_table=transcripts_table)
10055        else:
10056            log.info("No Transcripts to process. Check param.json file configuration")
10057
10058    ###############
10059    # Transcripts #
10060    ###############
10061
10062    def transcripts_export(
10063        self, transcripts_table: str = None, param: dict = {}
10064    ) -> bool:
10065        """ """
10066
10067        log.debug("Start transcripts export...")
10068
10069        # Param
10070        if not param:
10071            param = self.get_param()
10072
10073        # Param export
10074        param_transcript_export = param.get("transcripts", {}).get("export", {})
10075
10076        # Output file
10077        transcripts_export_output = param_transcript_export.get("output", None)
10078
10079        if not param_transcript_export or not transcripts_export_output:
10080            log.warning(f"No transcriipts export parameters defined!")
10081            return False
10082
10083        # List of transcripts annotations
10084        query_describe = f"""
10085            SELECT column_name
10086            FROM (
10087                    DESCRIBE SELECT * FROM {transcripts_table}
10088                )
10089            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10090        """
10091        transcripts_annotations_list = list(
10092            self.get_query_to_df(query=query_describe)["column_name"]
10093        )
10094
10095        # Create transcripts table for export
10096        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10097            random.choices(string.ascii_uppercase + string.digits, k=10)
10098        )
10099        query_create_transcripts_table_export = f"""
10100            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10101        """
10102        self.execute_query(query=query_create_transcripts_table_export)
10103
10104        # Output file format
10105        transcripts_export_output_format = get_file_format(
10106            filename=transcripts_export_output
10107        )
10108
10109        # Format VCF - construct INFO
10110        if transcripts_export_output_format in ["vcf"]:
10111
10112            # Construct query update INFO and header
10113            query_update_info = []
10114            for field in transcripts_annotations_list:
10115
10116                # If field not in header
10117                if field not in self.get_header_infos_list():
10118
10119                    # Add PZ Transcript in header
10120                    self.get_header().infos[field] = vcf.parser._Info(
10121                        field,
10122                        ".",
10123                        "String",
10124                        f"Annotation '{field}' from transcript view",
10125                        "unknown",
10126                        "unknown",
10127                        0,
10128                    )
10129
10130                # Add field as INFO/tag
10131                query_update_info.append(
10132                    f"""
10133                        CASE
10134                            WHEN "{field}" IS NOT NULL
10135                            THEN concat('{field}=', "{field}", ';')    
10136                            ELSE ''     
10137                        END
10138                        """
10139                )
10140
10141            # Query param
10142            query_update_info_value = (
10143                f""" concat('',  {", ".join(query_update_info)}) """
10144            )
10145            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10146
10147        else:
10148
10149            # Query param
10150            query_update_info_value = f""" NULL """
10151            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10152
10153        # Update query INFO column
10154        query_update = f"""
10155            UPDATE {transcripts_table_export}
10156            SET INFO = {query_update_info_value}
10157
10158        """
10159        self.execute_query(query=query_update)
10160
10161        # Export
10162        self.export_output(
10163            output_file=transcripts_export_output,
10164            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10165        )
10166
10167        # Drop transcripts export table
10168        query_drop_transcripts_table_export = f"""
10169            DROP TABLE {transcripts_table_export}
10170        """
10171        self.execute_query(query=query_drop_transcripts_table_export)
10172
10173    def transcripts_prioritization(
10174        self, transcripts_table: str = None, param: dict = {}
10175    ) -> bool:
10176        """
10177        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10178        and updates the variants table with the prioritized information.
10179
10180        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10181        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10182        This parameter is used to identify the table where the transcripts data is stored for the
10183        prioritization process
10184        :type transcripts_table: str
10185        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10186        that contains various configuration settings for the prioritization process of transcripts. It
10187        is used to customize the behavior of the prioritization algorithm and includes settings such as
10188        the prefix for prioritization fields, default profiles, and other
10189        :type param: dict
10190        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10191        transcripts prioritization process is successfully completed, and `False` if there are any
10192        issues or if no profile is defined for transcripts prioritization.
10193        """
10194
10195        log.debug("Start transcripts prioritization...")
10196
10197        # Param
10198        if not param:
10199            param = self.get_param()
10200
10201        # Variants table
10202        table_variants = self.get_table_variants()
10203
10204        # Transcripts table
10205        if transcripts_table is None:
10206            transcripts_table = self.create_transcript_view(
10207                transcripts_table="transcripts", param=param
10208            )
10209        if transcripts_table is None:
10210            msg_err = "No Transcripts table availalble"
10211            log.error(msg_err)
10212            raise ValueError(msg_err)
10213        log.debug(f"transcripts_table={transcripts_table}")
10214
10215        # Get transcripts columns
10216        columns_as_list_query = f"""
10217            DESCRIBE {transcripts_table}
10218        """
10219        columns_as_list = list(
10220            self.get_query_to_df(columns_as_list_query)["column_name"]
10221        )
10222
10223        # Create INFO if not exists
10224        if "INFO" not in columns_as_list:
10225            query_add_info = f"""
10226                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10227            """
10228            self.execute_query(query_add_info)
10229
10230        # Prioritization param and Force only PZ Score and Flag
10231        pz_param = param.get("transcripts", {}).get("prioritization", {})
10232
10233        # PZ profile by default
10234        pz_profile_default = (
10235            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10236        )
10237
10238        # Exit if no profile
10239        if pz_profile_default is None:
10240            log.warning("No profile defined for transcripts prioritization")
10241            return False
10242
10243        # PZ fields
10244        pz_param_pzfields = {}
10245
10246        # PZ field transcripts
10247        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10248
10249        # Add PZ Transcript in header
10250        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10251            pz_fields_transcripts,
10252            ".",
10253            "String",
10254            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10255            "unknown",
10256            "unknown",
10257            code_type_map["String"],
10258        )
10259
10260        # Mandatory fields
10261        pz_mandatory_fields_list = [
10262            "Score",
10263            "Flag",
10264            "Tags",
10265            "Comment",
10266            "Infos",
10267            "Class",
10268        ]
10269        pz_mandatory_fields = []
10270        for pz_mandatory_field in pz_mandatory_fields_list:
10271            pz_mandatory_fields.append(
10272                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10273            )
10274
10275        # PZ fields in param
10276        for pz_field in pz_param.get("pzfields", []):
10277            if pz_field in pz_mandatory_fields_list:
10278                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10279                    pz_param.get("pzprefix", "PTZ") + pz_field
10280                )
10281            else:
10282                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10283                pz_param_pzfields[pz_field] = pz_field_new
10284
10285                # Add PZ Transcript in header
10286                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10287                    pz_field_new,
10288                    ".",
10289                    "String",
10290                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10291                    "unknown",
10292                    "unknown",
10293                    code_type_map["String"],
10294                )
10295
10296        # PZ fields param
10297        pz_param["pzfields"] = pz_mandatory_fields
10298
10299        # Prioritization
10300        prioritization_result = self.prioritization(
10301            table=transcripts_table,
10302            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10303        )
10304        if not prioritization_result:
10305            log.warning("Transcripts prioritization not processed")
10306            return False
10307
10308        # PZ fields sql query
10309        query_update_select_list = []
10310        query_update_concat_list = []
10311        query_update_order_list = []
10312        for pz_param_pzfield in set(
10313            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10314        ):
10315            query_update_select_list.append(f" {pz_param_pzfield}, ")
10316
10317        for pz_param_pzfield in pz_param_pzfields:
10318            query_update_concat_list.append(
10319                f"""
10320                    , CASE 
10321                        WHEN {pz_param_pzfield} IS NOT NULL
10322                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10323                        ELSE ''
10324                    END
10325                """
10326            )
10327
10328        # Order by
10329        pz_orders = (
10330            param.get("transcripts", {})
10331            .get("prioritization", {})
10332            .get("prioritization_transcripts_order", {})
10333        )
10334        if not pz_orders:
10335            pz_orders = {
10336                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10337                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10338            }
10339        for pz_order in pz_orders:
10340            query_update_order_list.append(
10341                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10342            )
10343
10344        # Fields to explode
10345        fields_to_explode = (
10346            list(pz_param_pzfields.keys())
10347            + pz_mandatory_fields
10348            + list(pz_orders.keys())
10349        )
10350        # Remove transcript column as a specific transcript column
10351        if "transcript" in fields_to_explode:
10352            fields_to_explode.remove("transcript")
10353
10354        # Fields intranscripts table
10355        query_transcripts_table = f"""
10356            DESCRIBE SELECT * FROM {transcripts_table}
10357        """
10358        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10359
10360        # Check fields to explode
10361        for field_to_explode in fields_to_explode:
10362            if field_to_explode not in self.get_header_infos_list() + list(
10363                query_transcripts_table.column_name
10364            ):
10365                msg_err = f"INFO/{field_to_explode} NOT IN header"
10366                log.error(msg_err)
10367                raise ValueError(msg_err)
10368
10369        # Explode fields to explode
10370        self.explode_infos(
10371            table=transcripts_table,
10372            fields=fields_to_explode,
10373        )
10374
10375        # Transcript preference file
10376        transcripts_preference_file = (
10377            param.get("transcripts", {})
10378            .get("prioritization", {})
10379            .get("prioritization_transcripts", {})
10380        )
10381        transcripts_preference_file = full_path(transcripts_preference_file)
10382
10383        # Transcript preference forced
10384        transcript_preference_force = (
10385            param.get("transcripts", {})
10386            .get("prioritization", {})
10387            .get("prioritization_transcripts_force", False)
10388        )
10389        # Transcript version forced
10390        transcript_version_force = (
10391            param.get("transcripts", {})
10392            .get("prioritization", {})
10393            .get("prioritization_transcripts_version_force", False)
10394        )
10395
10396        # Transcripts Ranking
10397        if transcripts_preference_file:
10398
10399            # Transcripts file to dataframe
10400            if os.path.exists(transcripts_preference_file):
10401                transcripts_preference_dataframe = transcripts_file_to_df(
10402                    transcripts_preference_file
10403                )
10404            else:
10405                log.error(
10406                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10407                )
10408                raise ValueError(
10409                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10410                )
10411
10412            # Order by depending to transcript preference forcing
10413            if transcript_preference_force:
10414                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10415            else:
10416                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10417
10418            # Transcript columns joined depend on version consideration
10419            if transcript_version_force:
10420                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10421            else:
10422                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10423
10424            # Query ranking for update
10425            query_update_ranking = f"""
10426                SELECT
10427                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10428                    ROW_NUMBER() OVER (
10429                        PARTITION BY "#CHROM", POS, REF, ALT
10430                        ORDER BY {order_by}
10431                    ) AS rn
10432                FROM {transcripts_table}
10433                LEFT JOIN 
10434                    (
10435                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10436                        FROM transcripts_preference_dataframe
10437                    ) AS transcripts_preference
10438                ON {transcripts_version_join}
10439            """
10440
10441        else:
10442
10443            # Query ranking for update
10444            query_update_ranking = f"""
10445                SELECT
10446                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10447                    ROW_NUMBER() OVER (
10448                        PARTITION BY "#CHROM", POS, REF, ALT
10449                        ORDER BY {" , ".join(query_update_order_list)}
10450                    ) AS rn
10451                FROM {transcripts_table}
10452            """
10453
10454        # Export Transcripts prioritization infos to variants table
10455        query_update = f"""
10456            WITH RankedTranscripts AS (
10457                {query_update_ranking}
10458            )
10459            UPDATE {table_variants}
10460                SET
10461                INFO = CONCAT(CASE
10462                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10463                            THEN ''
10464                            ELSE concat("INFO", ';')
10465                        END,
10466                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10467                        )
10468            FROM
10469                RankedTranscripts
10470            WHERE
10471                rn = 1
10472                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10473                AND variants."POS" = RankedTranscripts."POS"
10474                AND variants."REF" = RankedTranscripts."REF"
10475                AND variants."ALT" = RankedTranscripts."ALT"     
10476        """
10477
10478        # log.debug(f"query_update={query_update}")
10479        self.execute_query(query=query_update)
10480
10481        # Return
10482        return True
10483
10484    def create_transcript_view_from_columns_map(
10485        self,
10486        transcripts_table: str = "transcripts",
10487        columns_maps: dict = {},
10488        added_columns: list = [],
10489        temporary_tables: list = None,
10490        annotation_fields: list = None,
10491        column_rename: dict = {},
10492        column_clean: bool = False,
10493        column_case: str = None,
10494    ) -> tuple[list, list, list]:
10495        """
10496        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10497        specified columns mapping for transcripts data.
10498
10499        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10500        of the table where the transcripts data is stored or will be stored in the database. This table
10501        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10502        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10503        :type transcripts_table: str (optional)
10504        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10505        about how to map columns from a transcripts table to create a view. Each entry in the
10506        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10507        typically includes details such as the main transcript column and additional information columns
10508        :type columns_maps: dict
10509        :param added_columns: The `added_columns` parameter in the
10510        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10511        that will be added to the view being created based on the columns map provided. These columns
10512        are generated by exploding the transcript information columns along with the main transcript
10513        column
10514        :type added_columns: list
10515        :param temporary_tables: The `temporary_tables` parameter in the
10516        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10517        tables created during the process of creating a transcript view from a columns map. These
10518        temporary tables are used to store intermediate results or transformations before the final view
10519        is generated
10520        :type temporary_tables: list
10521        :param annotation_fields: The `annotation_fields` parameter in the
10522        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10523        used for annotation in the query view creation process. These fields are extracted from the
10524        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10525        :type annotation_fields: list
10526        :param column_rename: The `column_rename` parameter in the
10527        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10528        custom renaming for columns during the creation of the temporary table view. This parameter
10529        provides a mapping of original column names to the desired renamed column names. By using this
10530        parameter,
10531        :type column_rename: dict
10532        :param column_clean: The `column_clean` parameter in the
10533        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10534        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10535        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10536        False
10537        :type column_clean: bool (optional)
10538        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10539        function is used to specify the case transformation to be applied to the columns during the view
10540        creation process. It allows you to control whether the column values should be converted to
10541        lowercase, uppercase, or remain unchanged
10542        :type column_case: str
10543        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10544        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10545        """
10546
10547        log.debug("Start transcrpts view creation from columns map...")
10548
10549        # "from_columns_map": [
10550        #     {
10551        #         "transcripts_column": "Ensembl_transcriptid",
10552        #         "transcripts_infos_columns": [
10553        #             "genename",
10554        #             "Ensembl_geneid",
10555        #             "LIST_S2_score",
10556        #             "LIST_S2_pred",
10557        #         ],
10558        #     },
10559        #     {
10560        #         "transcripts_column": "Ensembl_transcriptid",
10561        #         "transcripts_infos_columns": [
10562        #             "genename",
10563        #             "VARITY_R_score",
10564        #             "Aloft_pred",
10565        #         ],
10566        #     },
10567        # ],
10568
10569        # Init
10570        if temporary_tables is None:
10571            temporary_tables = []
10572        if annotation_fields is None:
10573            annotation_fields = []
10574
10575        # Variants table
10576        table_variants = self.get_table_variants()
10577
10578        for columns_map in columns_maps:
10579
10580            # Transcript column
10581            transcripts_column = columns_map.get("transcripts_column", None)
10582
10583            # Transcripts infos columns
10584            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10585
10586            # Transcripts infos columns rename
10587            column_rename = columns_map.get("column_rename", column_rename)
10588
10589            # Transcripts infos columns clean
10590            column_clean = columns_map.get("column_clean", column_clean)
10591
10592            # Transcripts infos columns case
10593            column_case = columns_map.get("column_case", column_case)
10594
10595            if transcripts_column is not None:
10596
10597                # Explode
10598                added_columns += self.explode_infos(
10599                    fields=[transcripts_column] + transcripts_infos_columns
10600                )
10601
10602                # View clauses
10603                clause_select_variants = []
10604                clause_select_tanscripts = []
10605                for field in [transcripts_column] + transcripts_infos_columns:
10606
10607                    # AS field
10608                    as_field = field
10609
10610                    # Rename
10611                    if column_rename:
10612                        as_field = column_rename.get(as_field, as_field)
10613
10614                    # Clean
10615                    if column_clean:
10616                        as_field = clean_annotation_field(as_field)
10617
10618                    # Case
10619                    if column_case:
10620                        if column_case.lower() in ["lower"]:
10621                            as_field = as_field.lower()
10622                        elif column_case.lower() in ["upper"]:
10623                            as_field = as_field.upper()
10624
10625                    # Clause select Variants
10626                    clause_select_variants.append(
10627                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10628                    )
10629
10630                    if field in [transcripts_column]:
10631                        clause_select_tanscripts.append(
10632                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10633                        )
10634                    else:
10635                        clause_select_tanscripts.append(
10636                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10637                        )
10638                        annotation_fields.append(as_field)
10639
10640                # Querey View
10641                query = f""" 
10642                    SELECT
10643                        "#CHROM", POS, REF, ALT, INFO,
10644                        "{transcripts_column}" AS 'transcript',
10645                        {", ".join(clause_select_tanscripts)}
10646                    FROM (
10647                        SELECT 
10648                            "#CHROM", POS, REF, ALT, INFO,
10649                            {", ".join(clause_select_variants)}
10650                        FROM {table_variants}
10651                        )
10652                    WHERE "{transcripts_column}" IS NOT NULL
10653                """
10654
10655                # Create temporary table
10656                temporary_table = transcripts_table + "".join(
10657                    random.choices(string.ascii_uppercase + string.digits, k=10)
10658                )
10659
10660                # Temporary_tables
10661                temporary_tables.append(temporary_table)
10662                query_view = f"""
10663                    CREATE TEMPORARY TABLE {temporary_table}
10664                    AS ({query})
10665                """
10666                self.execute_query(query=query_view)
10667
10668        return added_columns, temporary_tables, annotation_fields
10669
10670    def create_transcript_view_from_column_format(
10671        self,
10672        transcripts_table: str = "transcripts",
10673        column_formats: dict = {},
10674        temporary_tables: list = None,
10675        annotation_fields: list = None,
10676        column_rename: dict = {},
10677        column_clean: bool = False,
10678        column_case: str = None,
10679    ) -> tuple[list, list, list]:
10680        """
10681        The `create_transcript_view_from_column_format` function generates a transcript view based on
10682        specified column formats, adds additional columns and annotation fields, and returns the list of
10683        temporary tables and annotation fields.
10684
10685        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10686        of the table containing the transcripts data. This table will be used as the base table for
10687        creating the transcript view. The default value for this parameter is "transcripts", but you can
10688        provide a different table name if needed, defaults to transcripts
10689        :type transcripts_table: str (optional)
10690        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10691        about the columns to be used for creating the transcript view. Each entry in the dictionary
10692        specifies the mapping between a transcripts column and a transcripts infos column. This
10693        parameter allows you to define how the columns from the transcripts table should be transformed
10694        or mapped
10695        :type column_formats: dict
10696        :param temporary_tables: The `temporary_tables` parameter in the
10697        `create_transcript_view_from_column_format` function is a list that stores the names of
10698        temporary views created during the process of creating a transcript view from a column format.
10699        These temporary views are used to manipulate and extract data before generating the final
10700        transcript view
10701        :type temporary_tables: list
10702        :param annotation_fields: The `annotation_fields` parameter in the
10703        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10704        that are extracted from the temporary views created during the process. These annotation fields
10705        are obtained by querying the temporary views and extracting the column names excluding specific
10706        columns like `#CH
10707        :type annotation_fields: list
10708        :param column_rename: The `column_rename` parameter in the
10709        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10710        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10711        column names to new column names in this dictionary, you can rename specific columns during the
10712        process
10713        :type column_rename: dict
10714        :param column_clean: The `column_clean` parameter in the
10715        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10716        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10717        will be cleaned during the creation of the transcript view based on the specified column format,
10718        defaults to False
10719        :type column_clean: bool (optional)
10720        :param column_case: The `column_case` parameter in the
10721        `create_transcript_view_from_column_format` function is used to specify the case transformation
10722        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10723        to convert the column names to uppercase or lowercase, respectively
10724        :type column_case: str
10725        :return: The `create_transcript_view_from_column_format` function returns two lists:
10726        `temporary_tables` and `annotation_fields`.
10727        """
10728
10729        log.debug("Start transcrpts view creation from column format...")
10730
10731        #  "from_column_format": [
10732        #     {
10733        #         "transcripts_column": "ANN",
10734        #         "transcripts_infos_column": "Feature_ID",
10735        #     }
10736        # ],
10737
10738        # Init
10739        if temporary_tables is None:
10740            temporary_tables = []
10741        if annotation_fields is None:
10742            annotation_fields = []
10743
10744        for column_format in column_formats:
10745
10746            # annotation field and transcript annotation field
10747            annotation_field = column_format.get("transcripts_column", "ANN")
10748            transcript_annotation = column_format.get(
10749                "transcripts_infos_column", "Feature_ID"
10750            )
10751
10752            # Transcripts infos columns rename
10753            column_rename = column_format.get("column_rename", column_rename)
10754
10755            # Transcripts infos columns clean
10756            column_clean = column_format.get("column_clean", column_clean)
10757
10758            # Transcripts infos columns case
10759            column_case = column_format.get("column_case", column_case)
10760
10761            # Temporary View name
10762            temporary_view_name = transcripts_table + "".join(
10763                random.choices(string.ascii_uppercase + string.digits, k=10)
10764            )
10765
10766            # Create temporary view name
10767            temporary_view_name = self.annotation_format_to_table(
10768                uniquify=True,
10769                annotation_field=annotation_field,
10770                view_name=temporary_view_name,
10771                annotation_id=transcript_annotation,
10772                column_rename=column_rename,
10773                column_clean=column_clean,
10774                column_case=column_case,
10775            )
10776
10777            # Annotation fields
10778            if temporary_view_name:
10779                query_annotation_fields = f"""
10780                    SELECT *
10781                    FROM (
10782                        DESCRIBE SELECT *
10783                        FROM {temporary_view_name}
10784                        )
10785                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10786                """
10787                df_annotation_fields = self.get_query_to_df(
10788                    query=query_annotation_fields
10789                )
10790
10791                # Add temporary view and annotation fields
10792                temporary_tables.append(temporary_view_name)
10793                annotation_fields += list(set(df_annotation_fields["column_name"]))
10794
10795        return temporary_tables, annotation_fields
10796
10797    def create_transcript_view(
10798        self,
10799        transcripts_table: str = None,
10800        transcripts_table_drop: bool = False,
10801        param: dict = {},
10802    ) -> str:
10803        """
10804        The `create_transcript_view` function generates a transcript view by processing data from a
10805        specified table based on provided parameters and structural information.
10806
10807        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10808        is used to specify the name of the table that will store the final transcript view data. If a table
10809        name is not provided, the function will create a new table to store the transcript view data, and by
10810        default,, defaults to transcripts
10811        :type transcripts_table: str (optional)
10812        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10813        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10814        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10815        the function will drop the existing transcripts table if it exists, defaults to False
10816        :type transcripts_table_drop: bool (optional)
10817        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10818        contains information needed to create a transcript view. It includes details such as the structure
10819        of the transcripts, columns mapping, column formats, and other necessary information for generating
10820        the view. This parameter allows for flexibility and customization
10821        :type param: dict
10822        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10823        created or modified during the execution of the function.
10824        """
10825
10826        log.debug("Start transcripts view creation...")
10827
10828        # Default
10829        transcripts_table_default = "transcripts"
10830
10831        # Param
10832        if not param:
10833            param = self.get_param()
10834
10835        # Struct
10836        struct = param.get("transcripts", {}).get("struct", None)
10837
10838        # Transcript veresion
10839        transcript_id_remove_version = param.get("transcripts", {}).get(
10840            "transcript_id_remove_version", False
10841        )
10842
10843        # Transcripts mapping
10844        transcript_id_mapping_file = param.get("transcripts", {}).get(
10845            "transcript_id_mapping_file", None
10846        )
10847
10848        # Transcripts mapping
10849        transcript_id_mapping_force = param.get("transcripts", {}).get(
10850            "transcript_id_mapping_force", None
10851        )
10852
10853        if struct:
10854
10855            # Transcripts table
10856            if transcripts_table is None:
10857                transcripts_table = param.get("transcripts", {}).get(
10858                    "table", transcripts_table_default
10859                )
10860
10861            # added_columns
10862            added_columns = []
10863
10864            # Temporary tables
10865            temporary_tables = []
10866
10867            # Annotation fields
10868            annotation_fields = []
10869
10870            # from columns map
10871            columns_maps = struct.get("from_columns_map", [])
10872            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10873                self.create_transcript_view_from_columns_map(
10874                    transcripts_table=transcripts_table,
10875                    columns_maps=columns_maps,
10876                    added_columns=added_columns,
10877                    temporary_tables=temporary_tables,
10878                    annotation_fields=annotation_fields,
10879                )
10880            )
10881            added_columns += added_columns_tmp
10882            temporary_tables += temporary_tables_tmp
10883            annotation_fields += annotation_fields_tmp
10884
10885            # from column format
10886            column_formats = struct.get("from_column_format", [])
10887            temporary_tables_tmp, annotation_fields_tmp = (
10888                self.create_transcript_view_from_column_format(
10889                    transcripts_table=transcripts_table,
10890                    column_formats=column_formats,
10891                    temporary_tables=temporary_tables,
10892                    annotation_fields=annotation_fields,
10893                )
10894            )
10895            temporary_tables += temporary_tables_tmp
10896            annotation_fields += annotation_fields_tmp
10897
10898            # Remove some specific fields/column
10899            annotation_fields = list(set(annotation_fields))
10900            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10901                if field in annotation_fields:
10902                    annotation_fields.remove(field)
10903
10904            # Merge temporary tables query
10905            query_merge = ""
10906            for temporary_table in list(set(temporary_tables)):
10907
10908                # First temporary table
10909                if not query_merge:
10910                    query_merge = f"""
10911                        SELECT * FROM {temporary_table}
10912                    """
10913                # other temporary table (using UNION)
10914                else:
10915                    query_merge += f"""
10916                        UNION BY NAME SELECT * FROM {temporary_table}
10917                    """
10918
10919            # transcript table tmp
10920            transcript_table_tmp = "transcripts_tmp"
10921            transcript_table_tmp2 = "transcripts_tmp2"
10922            transcript_table_tmp3 = "transcripts_tmp3"
10923
10924            # Merge on transcript
10925            query_merge_on_transcripts_annotation_fields = []
10926
10927            # Add transcript list
10928            query_merge_on_transcripts_annotation_fields.append(
10929                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10930            )
10931
10932            # Aggregate all annotations fields
10933            for annotation_field in set(annotation_fields):
10934                query_merge_on_transcripts_annotation_fields.append(
10935                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10936                )
10937
10938            # Transcripts mapping
10939            if transcript_id_mapping_file:
10940
10941                # Transcript dataframe
10942                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10943                transcript_id_mapping_dataframe = transcripts_file_to_df(
10944                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10945                )
10946
10947                # Transcript version remove
10948                if transcript_id_remove_version:
10949                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10950                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10951                    query_left_join = f"""
10952                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10953                    """
10954                else:
10955                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10956                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10957                    query_left_join = f"""
10958                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10959                    """
10960
10961                # Transcript column for group by merge
10962                query_transcript_merge_group_by = """
10963                        CASE
10964                            WHEN transcript_mapped NOT IN ('')
10965                            THEN split_part(transcript_mapped, '.', 1)
10966                            ELSE split_part(transcript_original, '.', 1)
10967                        END
10968                    """
10969
10970                # Merge query
10971                transcripts_tmp2_query = f"""
10972                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10973                    FROM ({query_merge}) AS {transcript_table_tmp}
10974                    {query_left_join}
10975                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10976                """
10977
10978                # Retrive columns after mege
10979                transcripts_tmp2_describe_query = f"""
10980                    DESCRIBE {transcripts_tmp2_query}
10981                """
10982                transcripts_tmp2_describe_list = list(
10983                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10984                        "column_name"
10985                    ]
10986                )
10987
10988                # Create list of columns for select clause
10989                transcripts_tmp2_describe_select_clause = []
10990                for field in transcripts_tmp2_describe_list:
10991                    if field not in [
10992                        "#CHROM",
10993                        "POS",
10994                        "REF",
10995                        "ALT",
10996                        "INFO",
10997                        "transcript_mapped",
10998                    ]:
10999                        as_field = field
11000                        if field in ["transcript_original"]:
11001                            as_field = "transcripts_mapped"
11002                        transcripts_tmp2_describe_select_clause.append(
11003                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11004                        )
11005
11006                # Merge with mapping
11007                query_merge_on_transcripts = f"""
11008                    SELECT
11009                        "#CHROM", POS, REF, ALT, INFO,
11010                        CASE
11011                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11012                            THEN ANY_VALUE(transcript_mapped)
11013                            ELSE ANY_VALUE(transcript_original)
11014                        END AS transcript,
11015                        {", ".join(transcripts_tmp2_describe_select_clause)}
11016                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11017                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11018                        {query_transcript_merge_group_by}
11019                """
11020
11021                # Add transcript filter from mapping file
11022                if transcript_id_mapping_force:
11023                    query_merge_on_transcripts = f"""
11024                        SELECT *
11025                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11026                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11027                    """
11028
11029            # No transcript mapping
11030            else:
11031
11032                # Remove transcript version
11033                if transcript_id_remove_version:
11034                    query_transcript_column = f"""
11035                        split_part({transcript_table_tmp}.transcript, '.', 1)
11036                    """
11037                else:
11038                    query_transcript_column = """
11039                        transcript
11040                    """
11041
11042                # Query sections
11043                query_transcript_column_select = (
11044                    f"{query_transcript_column} AS transcript"
11045                )
11046                query_transcript_column_group_by = query_transcript_column
11047
11048                # Query for transcripts view
11049                query_merge_on_transcripts = f"""
11050                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11051                    FROM ({query_merge}) AS {transcript_table_tmp}
11052                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11053                """
11054
11055            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
11056
11057            # Drop transcript view is necessary
11058            if transcripts_table_drop:
11059                query_drop = f"""
11060                    DROP TABLE IF EXISTS {transcripts_table};
11061                """
11062                self.execute_query(query=query_drop)
11063
11064            # Merge and create transcript view
11065            query_create_view = f"""
11066                CREATE TABLE IF NOT EXISTS {transcripts_table}
11067                AS {query_merge_on_transcripts}
11068            """
11069            self.execute_query(query=query_create_view)
11070
11071            # Remove added columns
11072            for added_column in added_columns:
11073                self.drop_column(column=added_column)
11074
11075        else:
11076
11077            transcripts_table = None
11078
11079        return transcripts_table
11080
11081    def annotation_format_to_table(
11082        self,
11083        uniquify: bool = True,
11084        annotation_field: str = "ANN",
11085        annotation_id: str = "Feature_ID",
11086        view_name: str = "transcripts",
11087        column_rename: dict = {},
11088        column_clean: bool = False,
11089        column_case: str = None,
11090    ) -> str:
11091        """
11092        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11093        structured table format, ensuring unique values and creating a temporary table for further
11094        processing or analysis.
11095
11096        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11097        unique values in the output or not. If set to `True`, the function will make sure that the
11098        output values are unique, defaults to True
11099        :type uniquify: bool (optional)
11100        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11101        that contains the annotation information for each variant. This field is used to extract the
11102        annotation details for further processing in the function. By default, it is set to "ANN",
11103        defaults to ANN
11104        :type annotation_field: str (optional)
11105        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11106        is used to specify the identifier for the annotation feature. This identifier will be used as a
11107        column name in the resulting table or view that is created based on the annotation data. It
11108        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11109        :type annotation_id: str (optional)
11110        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11111        to specify the name of the temporary table that will be created to store the transformed
11112        annotation data. This table will hold the extracted information from the annotation field in a
11113        structured format for further processing or analysis. By default,, defaults to transcripts
11114        :type view_name: str (optional)
11115        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11116        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11117        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11118        created based on the annotation data. This feature enables
11119        :type column_rename: dict
11120        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11121        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11122        If set to `True`, the function will clean the annotation field before further processing. This
11123        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11124        to False
11125        :type column_clean: bool (optional)
11126        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11127        used to specify the case transformation to be applied to the column names extracted from the
11128        annotation data. It allows you to set the case of the column names to either lowercase or
11129        uppercase for consistency or other specific requirements during the conversion
11130        :type column_case: str
11131        :return: The function `annotation_format_to_table` is returning the name of the view created,
11132        which is stored in the variable `view_name`.
11133        """
11134
11135        # Annotation field
11136        annotation_format = "annotation_explode"
11137
11138        # Transcript annotation
11139        if column_rename:
11140            annotation_id = column_rename.get(annotation_id, annotation_id)
11141
11142        if column_clean:
11143            annotation_id = clean_annotation_field(annotation_id)
11144
11145        # Prefix
11146        prefix = self.get_explode_infos_prefix()
11147        if prefix:
11148            prefix = "INFO/"
11149
11150        # Annotation fields
11151        annotation_infos = prefix + annotation_field
11152        annotation_format_infos = prefix + annotation_format
11153
11154        # Variants table
11155        table_variants = self.get_table_variants()
11156
11157        # Header
11158        vcf_reader = self.get_header()
11159
11160        # Add columns
11161        added_columns = []
11162
11163        # Explode HGVS field in column
11164        added_columns += self.explode_infos(fields=[annotation_field])
11165
11166        if annotation_field in vcf_reader.infos:
11167
11168            # Extract ANN header
11169            ann_description = vcf_reader.infos[annotation_field].desc
11170            pattern = r"'(.+?)'"
11171            match = re.search(pattern, ann_description)
11172            if match:
11173                ann_header_match = match.group(1).split(" | ")
11174                ann_header = []
11175                ann_header_desc = {}
11176                for i in range(len(ann_header_match)):
11177                    ann_header_info = "".join(
11178                        char for char in ann_header_match[i] if char.isalnum()
11179                    )
11180                    ann_header.append(ann_header_info)
11181                    ann_header_desc[ann_header_info] = ann_header_match[i]
11182                if not ann_header_desc:
11183                    raise ValueError("Invalid header description format")
11184            else:
11185                raise ValueError("Invalid header description format")
11186
11187            # Create variant id
11188            variant_id_column = self.get_variant_id_column()
11189            added_columns += [variant_id_column]
11190
11191            # Create dataframe
11192            dataframe_annotation_format = self.get_query_to_df(
11193                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
11194            )
11195
11196            # Create annotation columns
11197            dataframe_annotation_format[
11198                annotation_format_infos
11199            ] = dataframe_annotation_format[annotation_infos].apply(
11200                lambda x: explode_annotation_format(
11201                    annotation=str(x),
11202                    uniquify=uniquify,
11203                    output_format="JSON",
11204                    prefix="",
11205                    header=list(ann_header_desc.values()),
11206                )
11207            )
11208
11209            # Find keys
11210            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
11211            df_keys = self.get_query_to_df(query=query_json)
11212
11213            # Check keys
11214            query_json_key = []
11215            for _, row in df_keys.iterrows():
11216
11217                # Key
11218                key = row.iloc[0]
11219                key_clean = key
11220
11221                # key rename
11222                if column_rename:
11223                    key_clean = column_rename.get(key_clean, key_clean)
11224
11225                # key clean
11226                if column_clean:
11227                    key_clean = clean_annotation_field(key_clean)
11228
11229                # Key case
11230                if column_case:
11231                    if column_case.lower() in ["lower"]:
11232                        key_clean = key_clean.lower()
11233                    elif column_case.lower() in ["upper"]:
11234                        key_clean = key_clean.upper()
11235
11236                # Type
11237                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
11238
11239                # Get DataFrame from query
11240                df_json_type = self.get_query_to_df(query=query_json_type)
11241
11242                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
11243                with pd.option_context("future.no_silent_downcasting", True):
11244                    df_json_type.fillna(value="", inplace=True)
11245                    replace_dict = {None: np.nan, "": np.nan}
11246                    df_json_type.replace(replace_dict, inplace=True)
11247                    df_json_type.dropna(inplace=True)
11248
11249                # Detect column type
11250                column_type = detect_column_type(df_json_type[key_clean])
11251
11252                # Append
11253                query_json_key.append(
11254                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11255                )
11256
11257            # Create view
11258            query_view = f"""
11259                CREATE TEMPORARY TABLE {view_name}
11260                AS (
11261                    SELECT *, {annotation_id} AS 'transcript'
11262                    FROM (
11263                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11264                        FROM dataframe_annotation_format
11265                        )
11266                    );
11267            """
11268            self.execute_query(query=query_view)
11269
11270        else:
11271
11272            # Return None
11273            view_name = None
11274
11275        # Remove added columns
11276        for added_column in added_columns:
11277            self.drop_column(column=added_column)
11278
11279        return view_name
11280
11281    def transcript_view_to_variants(
11282        self,
11283        transcripts_table: str = None,
11284        transcripts_column_id: str = None,
11285        transcripts_info_json: str = None,
11286        transcripts_info_field_json: str = None,
11287        transcripts_info_format: str = None,
11288        transcripts_info_field_format: str = None,
11289        param: dict = {},
11290    ) -> bool:
11291        """
11292        The `transcript_view_to_variants` function updates a variants table with information from
11293        transcripts in JSON format.
11294
11295        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11296        table containing the transcripts data. If this parameter is not provided, the function will
11297        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11298        :type transcripts_table: str
11299        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11300        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11301        identifier is used to match transcripts with variants in the database
11302        :type transcripts_column_id: str
11303        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11304        of the column in the variants table where the transcripts information will be stored in JSON
11305        format. This parameter allows you to define the column in the variants table that will hold the
11306        JSON-formatted information about transcripts
11307        :type transcripts_info_json: str
11308        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11309        specify the field in the VCF header that will contain information about transcripts in JSON
11310        format. This field will be added to the VCF header as an INFO field with the specified name
11311        :type transcripts_info_field_json: str
11312        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11313        format of the information about transcripts that will be stored in the variants table. This
11314        format can be used to define how the transcript information will be structured or displayed
11315        within the variants table
11316        :type transcripts_info_format: str
11317        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11318        specify the field in the VCF header that will contain information about transcripts in a
11319        specific format. This field will be added to the VCF header as an INFO field with the specified
11320        name
11321        :type transcripts_info_field_format: str
11322        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11323        that contains various configuration settings related to transcripts. It is used to provide
11324        default values for certain parameters if they are not explicitly provided when calling the
11325        method. The `param` dictionary can be passed as an argument
11326        :type param: dict
11327        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11328        if the operation is successful and `False` if certain conditions are not met.
11329        """
11330
11331        msg_info_prefix = "Start transcripts view to variants annotations"
11332
11333        log.debug(f"{msg_info_prefix}...")
11334
11335        # Default
11336        transcripts_table_default = "transcripts"
11337        transcripts_column_id_default = "transcript"
11338        transcripts_info_json_default = None
11339        transcripts_info_format_default = None
11340        transcripts_info_field_json_default = None
11341        transcripts_info_field_format_default = None
11342
11343        # Param
11344        if not param:
11345            param = self.get_param()
11346
11347        # Transcripts table
11348        if transcripts_table is None:
11349            transcripts_table = param.get("transcripts", {}).get(
11350                "table", transcripts_table_default
11351            )
11352
11353        # Transcripts column ID
11354        if transcripts_column_id is None:
11355            transcripts_column_id = param.get("transcripts", {}).get(
11356                "column_id", transcripts_column_id_default
11357            )
11358
11359        # Transcripts info json
11360        if transcripts_info_json is None:
11361            transcripts_info_json = param.get("transcripts", {}).get(
11362                "transcripts_info_json", transcripts_info_json_default
11363            )
11364
11365        # Transcripts info field JSON
11366        if transcripts_info_field_json is None:
11367            transcripts_info_field_json = param.get("transcripts", {}).get(
11368                "transcripts_info_field_json", transcripts_info_field_json_default
11369            )
11370        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11371        #     transcripts_info_json = transcripts_info_field_json
11372
11373        # Transcripts info format
11374        if transcripts_info_format is None:
11375            transcripts_info_format = param.get("transcripts", {}).get(
11376                "transcripts_info_format", transcripts_info_format_default
11377            )
11378
11379        # Transcripts info field FORMAT
11380        if transcripts_info_field_format is None:
11381            transcripts_info_field_format = param.get("transcripts", {}).get(
11382                "transcripts_info_field_format", transcripts_info_field_format_default
11383            )
11384        # if (
11385        #     transcripts_info_field_format is not None
11386        #     and transcripts_info_format is None
11387        # ):
11388        #     transcripts_info_format = transcripts_info_field_format
11389
11390        # Variants table
11391        table_variants = self.get_table_variants()
11392
11393        # Check info columns param
11394        if (
11395            transcripts_info_json is None
11396            and transcripts_info_field_json is None
11397            and transcripts_info_format is None
11398            and transcripts_info_field_format is None
11399        ):
11400            return False
11401
11402        # Transcripts infos columns
11403        query_transcripts_infos_columns = f"""
11404            SELECT *
11405            FROM (
11406                DESCRIBE SELECT * FROM {transcripts_table}
11407                )
11408            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11409        """
11410        transcripts_infos_columns = list(
11411            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11412        )
11413
11414        # View results
11415        clause_select = []
11416        clause_to_json = []
11417        clause_to_format = []
11418        for field in transcripts_infos_columns:
11419            # Do not consider INFO field for export into fields
11420            if field not in ["INFO"]:
11421                clause_select.append(
11422                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11423                )
11424                clause_to_json.append(f""" '{field}': "{field}" """)
11425                clause_to_format.append(f""" "{field}" """)
11426
11427        # Update
11428        update_set_json = []
11429        update_set_format = []
11430
11431        # VCF header
11432        vcf_reader = self.get_header()
11433
11434        # Transcripts to info column in JSON
11435        if transcripts_info_json:
11436
11437            # Create column on variants table
11438            self.add_column(
11439                table_name=table_variants,
11440                column_name=transcripts_info_json,
11441                column_type="JSON",
11442                default_value=None,
11443                drop=False,
11444            )
11445
11446            # Add header
11447            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11448                transcripts_info_json,
11449                ".",
11450                "String",
11451                "Transcripts in JSON format",
11452                "unknwon",
11453                "unknwon",
11454                self.code_type_map["String"],
11455            )
11456
11457            # Add to update
11458            update_set_json.append(
11459                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11460            )
11461
11462        # Transcripts to info field in JSON
11463        if transcripts_info_field_json:
11464
11465            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11466
11467            # Add to update
11468            update_set_json.append(
11469                f""" 
11470                    INFO = concat(
11471                            CASE
11472                                WHEN INFO NOT IN ('', '.')
11473                                THEN INFO
11474                                ELSE ''
11475                            END,
11476                            CASE
11477                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11478                                THEN concat(
11479                                    ';{transcripts_info_field_json}=',
11480                                    t.{transcripts_info_json}
11481                                )
11482                                ELSE ''
11483                            END
11484                            )
11485                """
11486            )
11487
11488            # Add header
11489            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11490                transcripts_info_field_json,
11491                ".",
11492                "String",
11493                "Transcripts in JSON format",
11494                "unknwon",
11495                "unknwon",
11496                self.code_type_map["String"],
11497            )
11498
11499        if update_set_json:
11500
11501            # Update query
11502            query_update = f"""
11503                UPDATE {table_variants}
11504                    SET {", ".join(update_set_json)}
11505                FROM
11506                (
11507                    SELECT
11508                        "#CHROM", POS, REF, ALT,
11509                            concat(
11510                            '{{',
11511                            string_agg(
11512                                '"' || "{transcripts_column_id}" || '":' ||
11513                                to_json(json_output)
11514                            ),
11515                            '}}'
11516                            )::JSON AS {transcripts_info_json}
11517                    FROM
11518                        (
11519                        SELECT
11520                            "#CHROM", POS, REF, ALT,
11521                            "{transcripts_column_id}",
11522                            to_json(
11523                                {{{",".join(clause_to_json)}}}
11524                            )::JSON AS json_output
11525                        FROM
11526                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11527                        WHERE "{transcripts_column_id}" IS NOT NULL
11528                        )
11529                    GROUP BY "#CHROM", POS, REF, ALT
11530                ) AS t
11531                WHERE {table_variants}."#CHROM" = t."#CHROM"
11532                    AND {table_variants}."POS" = t."POS"
11533                    AND {table_variants}."REF" = t."REF"
11534                    AND {table_variants}."ALT" = t."ALT"
11535            """
11536
11537            self.execute_query(query=query_update)
11538
11539        # Transcripts to info column in FORMAT
11540        if transcripts_info_format:
11541
11542            # Create column on variants table
11543            self.add_column(
11544                table_name=table_variants,
11545                column_name=transcripts_info_format,
11546                column_type="VARCHAR",
11547                default_value=None,
11548                drop=False,
11549            )
11550
11551            # Add header
11552            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11553                transcripts_info_format,
11554                ".",
11555                "String",
11556                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11557                "unknwon",
11558                "unknwon",
11559                self.code_type_map["String"],
11560            )
11561
11562            # Add to update
11563            update_set_format.append(
11564                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11565            )
11566
11567        else:
11568
11569            # Set variable for internal queries
11570            transcripts_info_format = "transcripts_info_format"
11571
11572        # Transcripts to info field in JSON
11573        if transcripts_info_field_format:
11574
11575            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11576
11577            # Add to update
11578            update_set_format.append(
11579                f""" 
11580                    INFO = concat(
11581                            CASE
11582                                WHEN INFO NOT IN ('', '.')
11583                                THEN INFO
11584                                ELSE ''
11585                            END,
11586                            CASE
11587                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11588                                THEN concat(
11589                                    ';{transcripts_info_field_format}=',
11590                                    t.{transcripts_info_format}
11591                                )
11592                                ELSE ''
11593                            END
11594                            )
11595                """
11596            )
11597
11598            # Add header
11599            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11600                transcripts_info_field_format,
11601                ".",
11602                "String",
11603                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11604                "unknwon",
11605                "unknwon",
11606                self.code_type_map["String"],
11607            )
11608
11609        if update_set_format:
11610
11611            # Update query
11612            query_update = f"""
11613                UPDATE {table_variants}
11614                    SET {", ".join(update_set_format)}
11615                FROM
11616                (
11617                    SELECT
11618                        "#CHROM", POS, REF, ALT,
11619                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11620                    FROM 
11621                        (
11622                        SELECT
11623                            "#CHROM", POS, REF, ALT,
11624                            "{transcripts_column_id}",
11625                            concat(
11626                                "{transcripts_column_id}",
11627                                '|',
11628                                {", '|', ".join(clause_to_format)}
11629                            ) AS {transcripts_info_format}
11630                        FROM
11631                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11632                        )
11633                    GROUP BY "#CHROM", POS, REF, ALT
11634                ) AS t
11635                WHERE {table_variants}."#CHROM" = t."#CHROM"
11636                    AND {table_variants}."POS" = t."POS"
11637                    AND {table_variants}."REF" = t."REF"
11638                    AND {table_variants}."ALT" = t."ALT"
11639            """
11640
11641            self.execute_query(query=query_update)
11642
11643        return True
11644
11645    def rename_info_fields(
11646        self, fields_to_rename: dict = None, table: str = None
11647    ) -> dict:
11648        """
11649        The `rename_info_fields` function renames specified fields in a VCF file header and updates
11650        corresponding INFO fields in the variants table.
11651
11652        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
11653        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
11654        represent the original field names that need to be renamed, and the corresponding values
11655        represent the new names to which the fields should be
11656        :type fields_to_rename: dict
11657        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
11658        the table in which the variants data is stored. This table contains information about genetic
11659        variants, and the function updates the corresponding INFO fields in this table when renaming
11660        specified fields in the VCF file header
11661        :type table: str
11662        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
11663        the original field names as keys and their corresponding new names (or None if the field was
11664        removed) as values after renaming or removing specified fields in a VCF file header and updating
11665        corresponding INFO fields in the variants table.
11666        """
11667
11668        # Init
11669        fields_renamed = {}
11670        config = self.get_config()
11671        access = config.get("access")
11672
11673        if table is None:
11674            table = self.get_table_variants()
11675
11676        if fields_to_rename is not None and access not in ["RO"]:
11677
11678            log.info("Rename or remove fields...")
11679
11680            # Header
11681            header = self.get_header()
11682
11683            for field_to_rename, field_renamed in fields_to_rename.items():
11684
11685                if field_to_rename in header.infos:
11686
11687                    # Rename header
11688                    if field_renamed is not None:
11689                        header.infos[field_renamed] = vcf.parser._Info(
11690                            field_renamed,
11691                            header.infos[field_to_rename].num,
11692                            header.infos[field_to_rename].type,
11693                            header.infos[field_to_rename].desc,
11694                            header.infos[field_to_rename].source,
11695                            header.infos[field_to_rename].version,
11696                            header.infos[field_to_rename].type_code,
11697                        )
11698                    del header.infos[field_to_rename]
11699
11700                    # Rename INFO patterns
11701                    field_pattern = rf'(^|;)({field_to_rename})=([^;]*)'
11702                    if field_renamed is not None:
11703                        field_renamed_pattern = rf'\1{field_renamed}=\3'
11704                    else:
11705                        field_renamed_pattern = ''
11706
11707                    # Rename INFO
11708                    query = f"""
11709                        UPDATE {table}
11710                        SET
11711                            INFO = regexp_replace(INFO, '{field_pattern}', '{field_renamed_pattern}', 'g')
11712                    """
11713                    self.execute_query(query=query)
11714
11715                    # Return
11716                    fields_renamed[field_to_rename] = field_renamed
11717
11718                    # Log
11719                    if field_renamed is not None:
11720                        log.info(f"Rename or remove fields: field '{field_to_rename}' renamed to '{field_renamed}'")
11721                    else:
11722                        log.info(f"Rename or remove fields: field '{field_to_rename}' removed")
11723
11724        return fields_renamed
11725
11726    def calculation_rename_info_fields(
11727        self,
11728        fields_to_rename: dict = None,
11729        table: str = None,
11730        operation_name: str = "RENAME_INFO_FIELDS",
11731    ) -> None:
11732        """
11733        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11734        fields to rename and table if provided, and then calls another function to rename the fields.
11735
11736        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11737        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11738        the key and the new field name as the value
11739        :type fields_to_rename: dict
11740        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11741        specify the name of the table for which the fields are to be renamed. It is a string type
11742        parameter
11743        :type table: str
11744        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11745        method is a string that specifies the name of the operation being performed. In this context, it
11746        is used as a default value for the operation name if not explicitly provided when calling the
11747        function, defaults to RENAME_INFO_FIELDS
11748        :type operation_name: str (optional)
11749        """
11750
11751        # Param
11752        param = self.get_param()
11753
11754        # Get param fields to rename
11755        param_fields_to_rename = (
11756            param.get("calculation", {})
11757            .get("calculations", {})
11758            .get(operation_name, {})
11759            .get("fields_to_rename", None)
11760        )
11761
11762        # Get param table
11763        param_table = (
11764            param.get("calculation", {})
11765            .get("calculations", {})
11766            .get(operation_name, {})
11767            .get("table", None)
11768        )
11769
11770        # Init fields_to_rename
11771        if fields_to_rename is None:
11772            fields_to_rename = param_fields_to_rename
11773
11774        # Init table
11775        if table is None:
11776            table = param_table
11777
11778        renamed_fields = self.rename_info_fields(
11779            fields_to_rename=fields_to_rename, table=table
11780        )
11781
11782        log.debug(f"renamed_fields:{renamed_fields}")
class Variants:
   36class Variants:
   37
   38    def __init__(
   39        self,
   40        conn=None,
   41        input: str = None,
   42        output: str = None,
   43        config: dict = {},
   44        param: dict = {},
   45        load: bool = False,
   46    ) -> None:
   47        """
   48        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   49        header
   50
   51        :param conn: the connection to the database
   52        :param input: the input file
   53        :param output: the output file
   54        :param config: a dictionary containing the configuration of the model
   55        :param param: a dictionary containing the parameters of the model
   56        """
   57
   58        # Init variables
   59        self.init_variables()
   60
   61        # Input
   62        self.set_input(input)
   63
   64        # Config
   65        self.set_config(config)
   66
   67        # Param
   68        self.set_param(param)
   69
   70        # Output
   71        self.set_output(output)
   72
   73        # connexion
   74        self.set_connexion(conn)
   75
   76        # Header
   77        self.set_header()
   78
   79        # Samples
   80        self.set_samples()
   81
   82        # Load data
   83        if load:
   84            self.load_data()
   85
   86    def set_samples(self, samples: list = None) -> list:
   87        """
   88        The function `set_samples` sets the samples attribute of an object to a provided list or
   89        retrieves it from a parameter dictionary.
   90
   91        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   92        input and sets the `samples` attribute of the class to the provided list. If no samples are
   93        provided, it tries to get the samples from the class's parameters using the `get_param` method
   94        :type samples: list
   95        :return: The `samples` list is being returned.
   96        """
   97
   98        if not samples:
   99            samples = self.get_param().get("samples", {}).get("list", None)
  100
  101        self.samples = samples
  102
  103        return samples
  104
  105    def get_samples(self) -> list:
  106        """
  107        This function returns a list of samples.
  108        :return: The `get_samples` method is returning the `samples` attribute of the object.
  109        """
  110
  111        return self.samples
  112
  113    def get_samples_check(self) -> bool:
  114        """
  115        This function returns the value of the "check" key within the "samples" dictionary retrieved
  116        from the parameters.
  117        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  118        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  119        method. If the key "check" is not found, it will return `False`.
  120        """
  121
  122        return self.get_param().get("samples", {}).get("check", True)
  123
  124    def set_input(self, input: str = None) -> None:
  125        """
  126        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  127        attributes in the class accordingly.
  128
  129        :param input: The `set_input` method in the provided code snippet is used to set attributes
  130        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  131        :type input: str
  132        """
  133
  134        if input and not isinstance(input, str):
  135            try:
  136                self.input = input.name
  137            except:
  138                log.error(f"Input file '{input} in bad format")
  139                raise ValueError(f"Input file '{input} in bad format")
  140        else:
  141            self.input = input
  142
  143        # Input format
  144        if input:
  145            input_name, input_extension = os.path.splitext(self.input)
  146            self.input_name = input_name
  147            self.input_extension = input_extension
  148            self.input_format = self.input_extension.replace(".", "")
  149
  150    def set_config(self, config: dict) -> None:
  151        """
  152        The set_config function takes a config object and assigns it as the configuration object for the
  153        class.
  154
  155        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  156        contains configuration settings for the class. When you call the `set_config` function with a
  157        dictionary object as the argument, it will set that dictionary as the configuration object for
  158        the class
  159        :type config: dict
  160        """
  161
  162        self.config = config
  163
  164    def set_param(self, param: dict) -> None:
  165        """
  166        This function sets a parameter object for the class based on the input dictionary.
  167
  168        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  169        as the `param` attribute of the class instance
  170        :type param: dict
  171        """
  172
  173        self.param = param
  174
  175    def init_variables(self) -> None:
  176        """
  177        This function initializes the variables that will be used in the rest of the class
  178        """
  179
  180        self.prefix = "howard"
  181        self.table_variants = "variants"
  182        self.dataframe = None
  183
  184        self.comparison_map = {
  185            "gt": ">",
  186            "gte": ">=",
  187            "lt": "<",
  188            "lte": "<=",
  189            "equals": "=",
  190            "contains": "SIMILAR TO",
  191        }
  192
  193        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  194
  195        self.code_type_map_to_sql = {
  196            "Integer": "INTEGER",
  197            "String": "VARCHAR",
  198            "Float": "FLOAT",
  199            "Flag": "VARCHAR",
  200        }
  201
  202        self.index_additionnal_fields = []
  203
  204    def get_indexing(self) -> bool:
  205        """
  206        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  207        returns False.
  208        :return: The value of the indexing parameter.
  209        """
  210
  211        return self.get_param().get("indexing", False)
  212
  213    def get_connexion_config(self) -> dict:
  214        """
  215        The function `get_connexion_config` returns a dictionary containing the configuration for a
  216        connection, including the number of threads and memory limit.
  217        :return: a dictionary containing the configuration for the Connexion library.
  218        """
  219
  220        # config
  221        config = self.get_config()
  222
  223        # Connexion config
  224        connexion_config = {}
  225        threads = self.get_threads()
  226
  227        # Threads
  228        if threads:
  229            connexion_config["threads"] = threads
  230
  231        # Memory
  232        # if config.get("memory", None):
  233        #     connexion_config["memory_limit"] = config.get("memory")
  234        if self.get_memory():
  235            connexion_config["memory_limit"] = self.get_memory()
  236
  237        # Temporary directory
  238        if config.get("tmp", None):
  239            connexion_config["temp_directory"] = config.get("tmp")
  240
  241        # Access
  242        if config.get("access", None):
  243            access = config.get("access")
  244            if access in ["RO"]:
  245                access = "READ_ONLY"
  246            elif access in ["RW"]:
  247                access = "READ_WRITE"
  248            connexion_db = self.get_connexion_db()
  249            if connexion_db in ":memory:":
  250                access = "READ_WRITE"
  251            connexion_config["access_mode"] = access
  252
  253        return connexion_config
  254
  255    def get_duckdb_settings(self) -> dict:
  256        """
  257        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  258        string.
  259        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  260        """
  261
  262        # config
  263        config = self.get_config()
  264
  265        # duckdb settings
  266        duckdb_settings_dict = {}
  267        if config.get("duckdb_settings", None):
  268            duckdb_settings = config.get("duckdb_settings")
  269            duckdb_settings = full_path(duckdb_settings)
  270            # duckdb setting is a file
  271            if os.path.exists(duckdb_settings):
  272                with open(duckdb_settings) as json_file:
  273                    duckdb_settings_dict = yaml.safe_load(json_file)
  274            # duckdb settings is a string
  275            else:
  276                duckdb_settings_dict = json.loads(duckdb_settings)
  277
  278        return duckdb_settings_dict
  279
  280    def set_connexion_db(self) -> str:
  281        """
  282        The function `set_connexion_db` returns the appropriate database connection string based on the
  283        input format and connection type.
  284        :return: the value of the variable `connexion_db`.
  285        """
  286
  287        # Default connexion db
  288        default_connexion_db = ":memory:"
  289
  290        # Find connexion db
  291        if self.get_input_format() in ["db", "duckdb"]:
  292            connexion_db = self.get_input()
  293        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  294            connexion_db = default_connexion_db
  295        elif self.get_connexion_type() in ["tmpfile"]:
  296            tmp_name = tempfile.mkdtemp(
  297                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  298            )
  299            connexion_db = f"{tmp_name}/tmp.db"
  300        elif self.get_connexion_type() != "":
  301            connexion_db = self.get_connexion_type()
  302        else:
  303            connexion_db = default_connexion_db
  304
  305        # Set connexion db
  306        self.connexion_db = connexion_db
  307
  308        return connexion_db
  309
  310    def set_connexion(self, conn) -> None:
  311        """
  312        The function `set_connexion` creates a connection to a database, with options for different
  313        database formats and settings.
  314
  315        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  316        database. If a connection is not provided, a new connection to an in-memory database is created.
  317        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  318        sqlite
  319        """
  320
  321        # Connexion db
  322        connexion_db = self.set_connexion_db()
  323
  324        # Connexion config
  325        connexion_config = self.get_connexion_config()
  326
  327        # Connexion format
  328        connexion_format = self.get_config().get("connexion_format", "duckdb")
  329        # Set connexion format
  330        self.connexion_format = connexion_format
  331
  332        # Connexion
  333        if not conn:
  334            if connexion_format in ["duckdb"]:
  335                conn = duckdb.connect(connexion_db, config=connexion_config)
  336                # duckDB settings
  337                duckdb_settings = self.get_duckdb_settings()
  338                if duckdb_settings:
  339                    for setting in duckdb_settings:
  340                        setting_value = duckdb_settings.get(setting)
  341                        if isinstance(setting_value, str):
  342                            setting_value = f"'{setting_value}'"
  343                        conn.execute(f"PRAGMA {setting}={setting_value};")
  344            elif connexion_format in ["sqlite"]:
  345                conn = sqlite3.connect(connexion_db)
  346
  347        # Set connexion
  348        self.conn = conn
  349
  350        # Log
  351        log.debug(f"connexion_format: {connexion_format}")
  352        log.debug(f"connexion_db: {connexion_db}")
  353        log.debug(f"connexion config: {connexion_config}")
  354        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  355
  356    def set_output(self, output: str = None) -> None:
  357        """
  358        The `set_output` function in Python sets the output file based on the input or a specified key
  359        in the config file, extracting the output name, extension, and format.
  360
  361        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  362        the output file. If the config file has an 'output' key, the method sets the output to the value
  363        of that key. If no output is provided, it sets the output to `None`
  364        :type output: str
  365        """
  366
  367        if output and not isinstance(output, str):
  368            self.output = output.name
  369        else:
  370            self.output = output
  371
  372        # Output format
  373        if self.output:
  374            output_name, output_extension = os.path.splitext(self.output)
  375            self.output_name = output_name
  376            self.output_extension = output_extension
  377            self.output_format = self.output_extension.replace(".", "")
  378        else:
  379            self.output_name = None
  380            self.output_extension = None
  381            self.output_format = None
  382
  383    def set_header(self) -> None:
  384        """
  385        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  386        """
  387
  388        input_file = self.get_input()
  389        default_header_list = [
  390            "##fileformat=VCFv4.2",
  391            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  392        ]
  393
  394        # Full path
  395        input_file = full_path(input_file)
  396
  397        if input_file:
  398
  399            input_format = self.get_input_format()
  400            input_compressed = self.get_input_compressed()
  401            config = self.get_config()
  402            header_list = default_header_list
  403            if input_format in [
  404                "vcf",
  405                "hdr",
  406                "tsv",
  407                "csv",
  408                "psv",
  409                "parquet",
  410                "db",
  411                "duckdb",
  412            ]:
  413                # header provided in param
  414                if config.get("header_file", None):
  415                    with open(config.get("header_file"), "rt") as f:
  416                        header_list = self.read_vcf_header(f)
  417                # within a vcf file format (header within input file itsself)
  418                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  419                    # within a compressed vcf file format (.vcf.gz)
  420                    if input_compressed:
  421                        with bgzf.open(input_file, "rt") as f:
  422                            header_list = self.read_vcf_header(f)
  423                    # within an uncompressed vcf file format (.vcf)
  424                    else:
  425                        with open(input_file, "rt") as f:
  426                            header_list = self.read_vcf_header(f)
  427                # header provided in default external file .hdr
  428                elif os.path.exists((input_file + ".hdr")):
  429                    with open(input_file + ".hdr", "rt") as f:
  430                        header_list = self.read_vcf_header(f)
  431                else:
  432                    try:  # Try to get header info fields and file columns
  433
  434                        with tempfile.TemporaryDirectory() as tmpdir:
  435
  436                            # Create database
  437                            db_for_header = Database(database=input_file)
  438
  439                            # Get header columns for infos fields
  440                            db_header_from_columns = (
  441                                db_for_header.get_header_from_columns()
  442                            )
  443
  444                            # Get real columns in the file
  445                            db_header_columns = db_for_header.get_columns()
  446
  447                            # Write header file
  448                            header_file_tmp = os.path.join(tmpdir, "header")
  449                            f = open(header_file_tmp, "w")
  450                            vcf.Writer(f, db_header_from_columns)
  451                            f.close()
  452
  453                            # Replace #CHROM line with rel columns
  454                            header_list = db_for_header.read_header_file(
  455                                header_file=header_file_tmp
  456                            )
  457                            header_list[-1] = "\t".join(db_header_columns)
  458
  459                    except:
  460
  461                        log.warning(
  462                            f"No header for file {input_file}. Set as default VCF header"
  463                        )
  464                        header_list = default_header_list
  465
  466            else:  # try for unknown format ?
  467
  468                log.error(f"Input file format '{input_format}' not available")
  469                raise ValueError(f"Input file format '{input_format}' not available")
  470
  471            if not header_list:
  472                header_list = default_header_list
  473
  474            # header as list
  475            self.header_list = header_list
  476
  477            # header as VCF object
  478            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  479
  480        else:
  481
  482            self.header_list = None
  483            self.header_vcf = None
  484
  485    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  486        """
  487        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  488        DataFrame based on the connection format.
  489
  490        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  491        represents the SQL query you want to execute. This query will be used to fetch data from a
  492        database and convert it into a pandas DataFrame
  493        :type query: str
  494        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  495        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  496        function will only fetch up to that number of rows from the database query result. If no limit
  497        is specified,
  498        :type limit: int
  499        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  500        """
  501
  502        # Connexion format
  503        connexion_format = self.get_connexion_format()
  504
  505        # Limit in query
  506        if limit:
  507            pd.set_option("display.max_rows", limit)
  508            if connexion_format in ["duckdb"]:
  509                df = (
  510                    self.conn.execute(query)
  511                    .fetch_record_batch(limit)
  512                    .read_next_batch()
  513                    .to_pandas()
  514                )
  515            elif connexion_format in ["sqlite"]:
  516                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  517
  518        # Full query
  519        else:
  520            if connexion_format in ["duckdb"]:
  521                df = self.conn.execute(query).df()
  522            elif connexion_format in ["sqlite"]:
  523                df = pd.read_sql_query(query, self.conn)
  524
  525        return df
  526
  527    def get_overview(self) -> None:
  528        """
  529        The function prints the input, output, config, and dataframe of the current object
  530        """
  531        table_variants_from = self.get_table_variants(clause="from")
  532        sql_columns = self.get_header_columns_as_sql()
  533        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  534        df = self.get_query_to_df(sql_query_export)
  535        log.info(
  536            "Input:  "
  537            + str(self.get_input())
  538            + " ["
  539            + str(str(self.get_input_format()))
  540            + "]"
  541        )
  542        log.info(
  543            "Output: "
  544            + str(self.get_output())
  545            + " ["
  546            + str(str(self.get_output_format()))
  547            + "]"
  548        )
  549        log.info("Config: ")
  550        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  551            "\n"
  552        ):
  553            log.info("\t" + str(d))
  554        log.info("Param: ")
  555        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  556            "\n"
  557        ):
  558            log.info("\t" + str(d))
  559        log.info("Sample list: " + str(self.get_header_sample_list()))
  560        log.info("Dataframe: ")
  561        for d in str(df).split("\n"):
  562            log.info("\t" + str(d))
  563
  564        # garbage collector
  565        del df
  566        gc.collect()
  567
  568        return None
  569
    def get_stats(self) -> dict:
        """
        The `get_stats` function calculates and returns various statistics of the current object,
        including information about the input file, variants, samples, header fields, quality, and
        SNVs/InDels.

        Returned dictionary keys:
        - "Infos": input file, total number of variants, number of samples (when
          genotypes are present) and number of INFO/FORMAT fields
        - "Variants": counts by chromosome, counts by type, substitution counts
        - "Samples": genotype counts per sample (only for VCF-like data with GT)
        - "Header": description of INFO and FORMAT fields
        - "Quality": QUAL statistics (only when a QUAL column is present)

        :return: a dictionary containing various statistics of the current object.
        """

        # Log
        log.info(f"Stats Calculation...")

        # Table containing the variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT definitions (mappings: field id -> field object)
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (actually a fraction in [0,1], not multiplied by 100)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when a GT format field and a FORMAT column exist
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample: the genotype is the leading
                # "0/1"-like token of the sample column; rows are counted only
                # when the sample column has as many ':'-separated fields as
                # FORMAT (i.e. the column looks like a well-formed genotype).
                # NOTE(review): REGEXP_EXTRACT / regexp_matches / string_split
                # are DuckDB SQL functions — presumably this requires a duckdb
                # connexion; confirm behavior for sqlite.
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A column counts as a sample only when at least one genotype was found
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # i is a running index across both INFO and FORMAT fields
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: map special VCF "Number" codes back to their letters
                # (None -> ".", -1 -> "A", -2 -> "G", -3 -> "R")
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type ("." when undefined)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description ("" when undefined)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL statistics — only when a QUAL column exists; '.' values excluded
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # Variant counts by type.
        # NOTE(review): in the InDel branch, SQL "AND" binds tighter than "OR",
        # so the condition reads len(REF) > 1 OR (len(ALT) > 1 AND
        # len(REF) != len(ALT)) — confirm this is the intended classification.
        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution counts (e.g. "A>G"), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  791
  792    def stats_to_file(self, file: str = None) -> str:
  793        """
  794        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  795        into a JSON object, and writes the JSON object to the specified file.
  796
  797        :param file: The `file` parameter is a string that represents the file path where the JSON data
  798        will be written
  799        :type file: str
  800        :return: the name of the file that was written to.
  801        """
  802
  803        # Get stats
  804        stats = self.get_stats()
  805
  806        # Serializing json
  807        json_object = json.dumps(stats, indent=4)
  808
  809        # Writing to sample.json
  810        with open(file, "w") as outfile:
  811            outfile.write(json_object)
  812
  813        return file
  814
  815    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  816        """
  817        The `print_stats` function generates a markdown file and prints the statistics contained in a
  818        JSON file in a formatted manner.
  819
  820        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  821        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  822        provided, a temporary directory will be created and the stats will be saved in a file named
  823        "stats.md" within that
  824        :type output_file: str
  825        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  826        file where the statistics will be saved. If no value is provided, a temporary directory will be
  827        created and a default file name "stats.json" will be used
  828        :type json_file: str
  829        :return: The function `print_stats` does not return any value. It has a return type annotation
  830        of `None`.
  831        """
  832
  833        # Full path
  834        output_file = full_path(output_file)
  835        json_file = full_path(json_file)
  836
  837        with tempfile.TemporaryDirectory() as tmpdir:
  838
  839            # Files
  840            if not output_file:
  841                output_file = os.path.join(tmpdir, "stats.md")
  842            if not json_file:
  843                json_file = os.path.join(tmpdir, "stats.json")
  844
  845            # Create folders
  846            if not os.path.exists(os.path.dirname(output_file)):
  847                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  848            if not os.path.exists(os.path.dirname(json_file)):
  849                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  850
  851            # Create stats JSON file
  852            stats_file = self.stats_to_file(file=json_file)
  853
  854            # Print stats file
  855            with open(stats_file) as f:
  856                stats = yaml.safe_load(f)
  857
  858            # Output
  859            output_title = []
  860            output_index = []
  861            output = []
  862
  863            # Title
  864            output_title.append("# HOWARD Stats")
  865
  866            # Index
  867            output_index.append("## Index")
  868
  869            # Process sections
  870            for section in stats:
  871                infos = stats.get(section)
  872                section_link = "#" + section.lower().replace(" ", "-")
  873                output.append(f"## {section}")
  874                output_index.append(f"- [{section}]({section_link})")
  875
  876                if len(infos):
  877                    for info in infos:
  878                        try:
  879                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  880                            is_df = True
  881                        except:
  882                            try:
  883                                df = pd.DataFrame.from_dict(
  884                                    json.loads((infos.get(info))), orient="index"
  885                                )
  886                                is_df = True
  887                            except:
  888                                is_df = False
  889                        if is_df:
  890                            output.append(f"### {info}")
  891                            info_link = "#" + info.lower().replace(" ", "-")
  892                            output_index.append(f"   - [{info}]({info_link})")
  893                            output.append(f"{df.to_markdown(index=False)}")
  894                        else:
  895                            output.append(f"- {info}: {infos.get(info)}")
  896                else:
  897                    output.append(f"NA")
  898
  899            # Write stats in markdown file
  900            with open(output_file, "w") as fp:
  901                for item in output_title:
  902                    fp.write("%s\n" % item)
  903                for item in output_index:
  904                    fp.write("%s\n" % item)
  905                for item in output:
  906                    fp.write("%s\n" % item)
  907
  908            # Output stats in markdown
  909            print("")
  910            print("\n\n".join(output_title))
  911            print("")
  912            print("\n\n".join(output))
  913            print("")
  914
  915        return None
  916
  917    def get_input(self) -> str:
  918        """
  919        It returns the value of the input variable.
  920        :return: The input is being returned.
  921        """
  922        return self.input
  923
  924    def get_input_format(self, input_file: str = None) -> str:
  925        """
  926        This function returns the format of the input variable, either from the provided input file or
  927        by prompting for input.
  928
  929        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  930        represents the file path of the input file. If no `input_file` is provided when calling the
  931        method, it will default to `None`
  932        :type input_file: str
  933        :return: The format of the input variable is being returned.
  934        """
  935
  936        if not input_file:
  937            input_file = self.get_input()
  938        input_format = get_file_format(input_file)
  939        return input_format
  940
  941    def get_input_compressed(self, input_file: str = None) -> str:
  942        """
  943        The function `get_input_compressed` returns the format of the input variable after compressing
  944        it.
  945
  946        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  947        that represents the file path of the input file. If no `input_file` is provided when calling the
  948        method, it will default to `None` and the method will then call `self.get_input()` to
  949        :type input_file: str
  950        :return: The function `get_input_compressed` returns the compressed format of the input
  951        variable.
  952        """
  953
  954        if not input_file:
  955            input_file = self.get_input()
  956        input_compressed = get_file_compressed(input_file)
  957        return input_compressed
  958
  959    def get_output(self) -> str:
  960        """
  961        It returns the output of the neuron.
  962        :return: The output of the neural network.
  963        """
  964
  965        return self.output
  966
  967    def get_output_format(self, output_file: str = None) -> str:
  968        """
  969        The function `get_output_format` returns the format of the input variable or the output file if
  970        provided.
  971
  972        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  973        that represents the file path of the output file. If no `output_file` is provided when calling
  974        the method, it will default to the output obtained from the `get_output` method of the class
  975        instance. The
  976        :type output_file: str
  977        :return: The format of the input variable is being returned.
  978        """
  979
  980        if not output_file:
  981            output_file = self.get_output()
  982        output_format = get_file_format(output_file)
  983
  984        return output_format
  985
  986    def get_config(self) -> dict:
  987        """
  988        It returns the config
  989        :return: The config variable is being returned.
  990        """
  991        return self.config
  992
  993    def get_param(self) -> dict:
  994        """
  995        It returns the param
  996        :return: The param variable is being returned.
  997        """
  998        return self.param
  999
 1000    def get_connexion_db(self) -> str:
 1001        """
 1002        It returns the connexion_db attribute of the object
 1003        :return: The connexion_db is being returned.
 1004        """
 1005        return self.connexion_db
 1006
 1007    def get_prefix(self) -> str:
 1008        """
 1009        It returns the prefix of the object.
 1010        :return: The prefix is being returned.
 1011        """
 1012        return self.prefix
 1013
 1014    def get_table_variants(self, clause: str = "select") -> str:
 1015        """
 1016        This function returns the table_variants attribute of the object
 1017
 1018        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1019        defaults to select (optional)
 1020        :return: The table_variants attribute of the object.
 1021        """
 1022
 1023        # Access
 1024        access = self.get_config().get("access", None)
 1025
 1026        # Clauses "select", "where", "update"
 1027        if clause in ["select", "where", "update"]:
 1028            table_variants = self.table_variants
 1029        # Clause "from"
 1030        elif clause in ["from"]:
 1031            # For Read Only
 1032            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1033                input_file = self.get_input()
 1034                table_variants = f"'{input_file}' as variants"
 1035            # For Read Write
 1036            else:
 1037                table_variants = f"{self.table_variants} as variants"
 1038        else:
 1039            table_variants = self.table_variants
 1040        return table_variants
 1041
 1042    def get_tmp_dir(self) -> str:
 1043        """
 1044        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1045        parameters or a default path.
 1046        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1047        configuration, parameters, and a default value of "/tmp".
 1048        """
 1049
 1050        return get_tmp(
 1051            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1052        )
 1053
 1054    def get_connexion_type(self) -> str:
 1055        """
 1056        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1057
 1058        :return: The connexion type is being returned.
 1059        """
 1060        return self.get_config().get("connexion_type", "memory")
 1061
 1062    def get_connexion(self):
 1063        """
 1064        It returns the connection object
 1065
 1066        :return: The connection object.
 1067        """
 1068        return self.conn
 1069
 1070    def close_connexion(self) -> None:
 1071        """
 1072        This function closes the connection to the database.
 1073        :return: The connection is being closed.
 1074        """
 1075        return self.conn.close()
 1076
 1077    def get_header(self, type: str = "vcf"):
 1078        """
 1079        This function returns the header of the VCF file as a list of strings
 1080
 1081        :param type: the type of header you want to get, defaults to vcf (optional)
 1082        :return: The header of the vcf file.
 1083        """
 1084
 1085        if self.header_vcf:
 1086            if type == "vcf":
 1087                return self.header_vcf
 1088            elif type == "list":
 1089                return self.header_list
 1090        else:
 1091            if type == "vcf":
 1092                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1093                return header
 1094            elif type == "list":
 1095                return vcf_required
 1096
 1097    def get_header_infos_list(self) -> list:
 1098        """
 1099        This function retrieves a list of information fields from the header.
 1100        :return: A list of information fields from the header.
 1101        """
 1102
 1103        # Init
 1104        infos_list = []
 1105
 1106        for field in self.get_header().infos:
 1107            infos_list.append(field)
 1108
 1109        return infos_list
 1110
 1111    def get_header_length(self, file: str = None) -> int:
 1112        """
 1113        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1114        line.
 1115
 1116        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1117        header file. If this argument is provided, the function will read the header from the specified
 1118        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1119        :type file: str
 1120        :return: the length of the header list, excluding the #CHROM line.
 1121        """
 1122
 1123        if file:
 1124            return len(self.read_vcf_header_file(file=file)) - 1
 1125        elif self.get_header(type="list"):
 1126            return len(self.get_header(type="list")) - 1
 1127        else:
 1128            return 0
 1129
 1130    def get_header_columns(self) -> str:
 1131        """
 1132        This function returns the header list of a VCF
 1133
 1134        :return: The length of the header list.
 1135        """
 1136        if self.get_header():
 1137            return self.get_header(type="list")[-1]
 1138        else:
 1139            return ""
 1140
 1141    def get_header_columns_as_list(self) -> list:
 1142        """
 1143        This function returns the header list of a VCF
 1144
 1145        :return: The length of the header list.
 1146        """
 1147        if self.get_header():
 1148            return self.get_header_columns().strip().split("\t")
 1149        else:
 1150            return []
 1151
 1152    def get_header_columns_as_sql(self) -> str:
 1153        """
 1154        This function retruns header length (without #CHROM line)
 1155
 1156        :return: The length of the header list.
 1157        """
 1158        sql_column_list = []
 1159        for col in self.get_header_columns_as_list():
 1160            sql_column_list.append(f'"{col}"')
 1161        return ",".join(sql_column_list)
 1162
 1163    def get_header_sample_list(
 1164        self, check: bool = False, samples: list = None, samples_force: bool = False
 1165    ) -> list:
 1166        """
 1167        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1168        checking and filtering based on input parameters.
 1169
 1170        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1171        parameter that determines whether to check if the samples in the list are properly defined as
 1172        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1173        list is defined as a, defaults to False
 1174        :type check: bool (optional)
 1175        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1176        allows you to specify a subset of samples from the header. If you provide a list of sample
 1177        names, the function will check if each sample is defined in the header. If a sample is not found
 1178        in the
 1179        :type samples: list
 1180        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1181        a boolean parameter that determines whether to force the function to return the sample list
 1182        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1183        function will return the sample list without performing, defaults to False
 1184        :type samples_force: bool (optional)
 1185        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1186        parameters and conditions specified in the function.
 1187        """
 1188
 1189        # Init
 1190        samples_list = []
 1191
 1192        if samples is None:
 1193            samples_list = self.header_vcf.samples
 1194        else:
 1195            samples_checked = []
 1196            for sample in samples:
 1197                if sample in self.header_vcf.samples:
 1198                    samples_checked.append(sample)
 1199                else:
 1200                    log.warning(f"Sample '{sample}' not defined in header")
 1201            samples_list = samples_checked
 1202
 1203            # Force sample list without checking if is_genotype_column
 1204            if samples_force:
 1205                log.warning(f"Samples {samples_list} not checked if genotypes")
 1206                return samples_list
 1207
 1208        if check:
 1209            samples_checked = []
 1210            for sample in samples_list:
 1211                if self.is_genotype_column(column=sample):
 1212                    samples_checked.append(sample)
 1213                else:
 1214                    log.warning(
 1215                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1216                    )
 1217            samples_list = samples_checked
 1218
 1219        # Return samples list
 1220        return samples_list
 1221
 1222    def is_genotype_column(self, column: str = None) -> bool:
 1223        """
 1224        This function checks if a given column is a genotype column in a database.
 1225
 1226        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1227        represents the column name in a database table. This method checks if the specified column is a
 1228        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1229        method of
 1230        :type column: str
 1231        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1232        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1233        column name and returns the result. If the `column` parameter is None, it returns False.
 1234        """
 1235
 1236        if column is not None:
 1237            return Database(database=self.get_input()).is_genotype_column(column=column)
 1238        else:
 1239            return False
 1240
 1241    def get_verbose(self) -> bool:
 1242        """
 1243        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1244        exist
 1245
 1246        :return: The value of the key "verbose" in the config dictionary.
 1247        """
 1248        return self.get_config().get("verbose", False)
 1249
 1250    def get_connexion_format(self) -> str:
 1251        """
 1252        It returns the connexion format of the object.
 1253        :return: The connexion_format is being returned.
 1254        """
 1255        connexion_format = self.connexion_format
 1256        if connexion_format not in ["duckdb", "sqlite"]:
 1257            log.error(f"Unknown connexion format {connexion_format}")
 1258            raise ValueError(f"Unknown connexion format {connexion_format}")
 1259        else:
 1260            return connexion_format
 1261
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and append each chunk to the
        "variants" table of the current connexion.

        :param file: path (or buffer) of the file to load
        :param columns: comma-separated column names used in the INSERT
        statement
        :type columns: str
        :param header_len: number of leading lines to skip before the data
        (e.g. a VCF header), defaults to 0
        :type header_len: int (optional)
        :param sep: field separator of the file, defaults to a tab character
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; may be overridden by
        the "load.chunk" configuration entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: allow config["load"]["chunk"] to override the chunk size
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE: when chunksize is falsy (0/None), nothing is loaded
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves the name "chunk" in the SQL text to the
                    # local pandas DataFrame (replacement scan) — do not rename
                    # the loop variable
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # pandas performs the append itself for sqlite connexions
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1315
 1316    def load_data(
 1317        self,
 1318        input_file: str = None,
 1319        drop_variants_table: bool = False,
 1320        sample_size: int = 20480,
 1321    ) -> None:
 1322        """
 1323        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1324        table before loading the data and specify a sample size.
 1325
 1326        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1327        table
 1328        :type input_file: str
 1329        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1330        determines whether the variants table should be dropped before loading the data. If set to
 1331        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1332        not be dropped, defaults to False
 1333        :type drop_variants_table: bool (optional)
 1334        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1335        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1336        20480
 1337        :type sample_size: int (optional)
 1338        """
 1339
 1340        log.info("Loading...")
 1341
 1342        # change input file
 1343        if input_file:
 1344            self.set_input(input_file)
 1345            self.set_header()
 1346
 1347        # drop variants table
 1348        if drop_variants_table:
 1349            self.drop_variants_table()
 1350
 1351        # get table variants
 1352        table_variants = self.get_table_variants()
 1353
 1354        # Access
 1355        access = self.get_config().get("access", None)
 1356        log.debug(f"access: {access}")
 1357
 1358        # Input format and compress
 1359        input_format = self.get_input_format()
 1360        input_compressed = self.get_input_compressed()
 1361        log.debug(f"input_format: {input_format}")
 1362        log.debug(f"input_compressed: {input_compressed}")
 1363
 1364        # input_compressed_format
 1365        if input_compressed:
 1366            input_compressed_format = "gzip"
 1367        else:
 1368            input_compressed_format = "none"
 1369        log.debug(f"input_compressed_format: {input_compressed_format}")
 1370
 1371        # Connexion format
 1372        connexion_format = self.get_connexion_format()
 1373
 1374        # Sample size
 1375        if not sample_size:
 1376            sample_size = -1
 1377        log.debug(f"sample_size: {sample_size}")
 1378
 1379        # Load data
 1380        log.debug(f"Load Data from {input_format}")
 1381
 1382        # DuckDB connexion
 1383        if connexion_format in ["duckdb"]:
 1384
 1385            # Database already exists
 1386            if self.input_format in ["db", "duckdb"]:
 1387
 1388                if connexion_format in ["duckdb"]:
 1389                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1390                else:
 1391                    log.error(
 1392                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1393                    )
 1394                    raise ValueError(
 1395                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1396                    )
 1397
 1398            # Load from existing database format
 1399            else:
 1400
 1401                try:
 1402                    # Create Table or View
 1403                    database = Database(database=self.input)
 1404                    sql_from = database.get_sql_from(sample_size=sample_size)
 1405
 1406                    if access in ["RO"]:
 1407                        sql_load = (
 1408                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1409                        )
 1410                    else:
 1411                        sql_load = (
 1412                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1413                        )
 1414                    self.conn.execute(sql_load)
 1415
 1416                except:
 1417                    # Format not available
 1418                    log.error(f"Input file format '{self.input_format}' not available")
 1419                    raise ValueError(
 1420                        f"Input file format '{self.input_format}' not available"
 1421                    )
 1422
 1423        # SQLite connexion
 1424        elif connexion_format in ["sqlite"] and input_format in [
 1425            "vcf",
 1426            "tsv",
 1427            "csv",
 1428            "psv",
 1429        ]:
 1430
 1431            # Main structure
 1432            structure = {
 1433                "#CHROM": "VARCHAR",
 1434                "POS": "INTEGER",
 1435                "ID": "VARCHAR",
 1436                "REF": "VARCHAR",
 1437                "ALT": "VARCHAR",
 1438                "QUAL": "VARCHAR",
 1439                "FILTER": "VARCHAR",
 1440                "INFO": "VARCHAR",
 1441            }
 1442
 1443            # Strcuture with samples
 1444            structure_complete = structure
 1445            if self.get_header_sample_list():
 1446                structure["FORMAT"] = "VARCHAR"
 1447                for sample in self.get_header_sample_list():
 1448                    structure_complete[sample] = "VARCHAR"
 1449
 1450            # Columns list for create and insert
 1451            sql_create_table_columns = []
 1452            sql_create_table_columns_list = []
 1453            for column in structure_complete:
 1454                column_type = structure_complete[column]
 1455                sql_create_table_columns.append(
 1456                    f'"{column}" {column_type} default NULL'
 1457                )
 1458                sql_create_table_columns_list.append(f'"{column}"')
 1459
 1460            # Create database
 1461            log.debug(f"Create Table {table_variants}")
 1462            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1463            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1464            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1465            self.conn.execute(sql_create_table)
 1466
 1467            # chunksize define length of file chunk load file
 1468            chunksize = 100000
 1469
 1470            # delimiter
 1471            delimiter = file_format_delimiters.get(input_format, "\t")
 1472
 1473            # Load the input file
 1474            with open(self.input, "rt") as input_file:
 1475
 1476                # Use the appropriate file handler based on the input format
 1477                if input_compressed:
 1478                    input_file = bgzf.open(self.input, "rt")
 1479                if input_format in ["vcf"]:
 1480                    header_len = self.get_header_length()
 1481                else:
 1482                    header_len = 0
 1483
 1484                # Insert the file contents into a table
 1485                self.insert_file_to_table(
 1486                    input_file,
 1487                    columns=sql_create_table_columns_list_sql,
 1488                    header_len=header_len,
 1489                    sep=delimiter,
 1490                    chunksize=chunksize,
 1491                )
 1492
 1493        else:
 1494            log.error(
 1495                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1496            )
 1497            raise ValueError(
 1498                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1499            )
 1500
 1501        # Explode INFOS fields into table fields
 1502        if self.get_explode_infos():
 1503            self.explode_infos(
 1504                prefix=self.get_explode_infos_prefix(),
 1505                fields=self.get_explode_infos_fields(),
 1506                force=True,
 1507            )
 1508
 1509        # Create index after insertion
 1510        self.create_indexes()
 1511
 1512    def get_explode_infos(self) -> bool:
 1513        """
 1514        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1515        to False if it is not set.
 1516        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1517        value. If the parameter is not present, it will return False.
 1518        """
 1519
 1520        return self.get_param().get("explode", {}).get("explode_infos", False)
 1521
 1522    def get_explode_infos_fields(
 1523        self,
 1524        explode_infos_fields: str = None,
 1525        remove_fields_not_in_header: bool = False,
 1526    ) -> list:
 1527        """
 1528        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1529        the input parameter `explode_infos_fields`.
 1530
 1531        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1532        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1533        comma-separated list of field names to explode
 1534        :type explode_infos_fields: str
 1535        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1536        flag that determines whether to remove fields that are not present in the header. If it is set
 1537        to `True`, any field that is not in the header will be excluded from the list of exploded
 1538        information fields. If it is set to `, defaults to False
 1539        :type remove_fields_not_in_header: bool (optional)
 1540        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1541        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1542        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1543        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1544        splitting the string by commas.
 1545        """
 1546
 1547        # If no fields, get it in param
 1548        if not explode_infos_fields:
 1549            explode_infos_fields = (
 1550                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1551            )
 1552
 1553        # If no fields, defined as all fields in header using keyword
 1554        if not explode_infos_fields:
 1555            explode_infos_fields = "*"
 1556
 1557        # If fields list not empty
 1558        if explode_infos_fields:
 1559
 1560            # Input fields list
 1561            if isinstance(explode_infos_fields, str):
 1562                fields_input = explode_infos_fields.split(",")
 1563            elif isinstance(explode_infos_fields, list):
 1564                fields_input = explode_infos_fields
 1565            else:
 1566                fields_input = []
 1567
 1568            # Fields list without * keyword
 1569            fields_without_all = fields_input.copy()
 1570            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1571                fields_without_all.remove("*")
 1572
 1573            # Fields in header
 1574            fields_in_header = sorted(list(set(self.get_header().infos)))
 1575
 1576            # Construct list of fields
 1577            fields_output = []
 1578            for field in fields_input:
 1579
 1580                # Strip field
 1581                field = field.strip()
 1582
 1583                # format keyword * in regex
 1584                if field.upper() in ["*"]:
 1585                    field = ".*"
 1586
 1587                # Find all fields with pattern
 1588                r = re.compile(field)
 1589                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1590
 1591                # Remove fields input from search
 1592                if field in fields_search:
 1593                    fields_search = [field]
 1594                elif fields_search != [field]:
 1595                    fields_search = sorted(
 1596                        list(set(fields_search).difference(fields_input))
 1597                    )
 1598
 1599                # If field is not in header (avoid not well formatted header)
 1600                if not fields_search and not remove_fields_not_in_header:
 1601                    fields_search = [field]
 1602
 1603                # Add found fields
 1604                for new_field in fields_search:
 1605                    # Add field, if not already exists, and if it is in header (if asked)
 1606                    if (
 1607                        new_field not in fields_output
 1608                        and (
 1609                            not remove_fields_not_in_header
 1610                            or new_field in fields_in_header
 1611                        )
 1612                        and new_field not in [".*"]
 1613                    ):
 1614                        fields_output.append(new_field)
 1615
 1616            return fields_output
 1617
 1618        else:
 1619
 1620            return []
 1621
 1622    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1623        """
 1624        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1625        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1626        not provided.
 1627
 1628        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1629        prefix to be used for exploding or expanding information
 1630        :type explode_infos_prefix: str
 1631        :return: the value of the variable `explode_infos_prefix`.
 1632        """
 1633
 1634        if not explode_infos_prefix:
 1635            explode_infos_prefix = (
 1636                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1637            )
 1638
 1639        return explode_infos_prefix
 1640
 1641    def add_column(
 1642        self,
 1643        table_name,
 1644        column_name,
 1645        column_type,
 1646        default_value=None,
 1647        drop: bool = False,
 1648    ) -> dict:
 1649        """
 1650        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1651        doesn't already exist.
 1652
 1653        :param table_name: The name of the table to which you want to add a column
 1654        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1655        to the table
 1656        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1657        want to add to the table. It should be a string that represents the desired data type, such as
 1658        "INTEGER", "TEXT", "REAL", etc
 1659        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1660        default value for the newly added column. If a default value is provided, it will be assigned to
 1661        the column for any existing rows that do not have a value for that column
 1662        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1663        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1664        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1665        to False
 1666        :type drop: bool (optional)
 1667        :return: a boolean value indicating whether the column was successfully added to the table.
 1668        """
 1669
 1670        # added
 1671        added = False
 1672        dropped = False
 1673
 1674        # Check if the column already exists in the table
 1675        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1676        columns = self.get_query_to_df(query).columns.tolist()
 1677        if column_name.upper() in [c.upper() for c in columns]:
 1678            log.debug(
 1679                f"The {column_name} column already exists in the {table_name} table"
 1680            )
 1681            if drop:
 1682                self.drop_column(table_name=table_name, column_name=column_name)
 1683                dropped = True
 1684            else:
 1685                return None
 1686        else:
 1687            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1688
 1689        # Add column in table
 1690        add_column_query = (
 1691            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1692        )
 1693        if default_value is not None:
 1694            add_column_query += f" DEFAULT {default_value}"
 1695        self.execute_query(add_column_query)
 1696        added = not dropped
 1697        log.debug(
 1698            f"The {column_name} column was successfully added to the {table_name} table"
 1699        )
 1700
 1701        if added:
 1702            added_column = {
 1703                "table_name": table_name,
 1704                "column_name": column_name,
 1705                "column_type": column_type,
 1706                "default_value": default_value,
 1707            }
 1708        else:
 1709            added_column = None
 1710
 1711        return added_column
 1712
 1713    def drop_column(
 1714        self, column: dict = None, table_name: str = None, column_name: str = None
 1715    ) -> bool:
 1716        """
 1717        The `drop_column` function drops a specified column from a given table in a database and returns
 1718        True if the column was successfully dropped, and False if the column does not exist in the
 1719        table.
 1720
 1721        :param column: The `column` parameter is a dictionary that contains information about the column
 1722        you want to drop. It has two keys:
 1723        :type column: dict
 1724        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1725        drop a column
 1726        :type table_name: str
 1727        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1728        from the table
 1729        :type column_name: str
 1730        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1731        and False if the column does not exist in the table.
 1732        """
 1733
 1734        # Find column infos
 1735        if column:
 1736            if isinstance(column, dict):
 1737                table_name = column.get("table_name", None)
 1738                column_name = column.get("column_name", None)
 1739            elif isinstance(column, str):
 1740                table_name = self.get_table_variants()
 1741                column_name = column
 1742            else:
 1743                table_name = None
 1744                column_name = None
 1745
 1746        if not table_name and not column_name:
 1747            return False
 1748
 1749        # Removed
 1750        removed = False
 1751
 1752        # Check if the column already exists in the table
 1753        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1754        columns = self.get_query_to_df(query).columns.tolist()
 1755        if column_name in columns:
 1756            log.debug(f"The {column_name} column exists in the {table_name} table")
 1757        else:
 1758            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1759            return False
 1760
 1761        # Add column in table # ALTER TABLE integers DROP k
 1762        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1763        self.execute_query(add_column_query)
 1764        removed = True
 1765        log.debug(
 1766            f"The {column_name} column was successfully dropped to the {table_name} table"
 1767        )
 1768
 1769        return removed
 1770
 1771    def explode_infos(
 1772        self,
 1773        prefix: str = None,
 1774        create_index: bool = False,
 1775        fields: list = None,
 1776        force: bool = False,
 1777        proccess_all_fields_together: bool = False,
 1778        table: str = None,
 1779    ) -> list:
 1780        """
 1781        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
 1782        individual columns, returning a list of added columns.
 1783
 1784        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
 1785        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
 1786        `self.get_explode_infos_prefix()` as the prefix
 1787        :type prefix: str
 1788        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
 1789        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
 1790        `False`, indexes will not be created. The default value is `False`, defaults to False
 1791        :type create_index: bool (optional)
 1792        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
 1793        that you want to explode into individual columns. If this parameter is not provided, all INFO
 1794        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
 1795        a list to the `
 1796        :type fields: list
 1797        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
 1798        determines whether to drop and recreate a column if it already exists in the table. If `force`
 1799        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
 1800        defaults to False
 1801        :type force: bool (optional)
 1802        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
 1803        flag that determines whether to process all the INFO fields together or individually. If set to
 1804        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
 1805        be processed individually. The default value is, defaults to False
 1806        :type proccess_all_fields_together: bool (optional)
 1807        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
 1808        of the table where the exploded INFO fields will be added as individual columns. If you provide
 1809        a value for the `table` parameter, the function will use that table name. If the `table`
 1810        parameter is
 1811        :type table: str
 1812        :return: The `explode_infos` function returns a list of added columns.
 1813        """
 1814
 1815        # drop indexes
 1816        self.drop_indexes()
 1817
 1818        # connexion format
 1819        connexion_format = self.get_connexion_format()
 1820
 1821        # Access
 1822        access = self.get_config().get("access", None)
 1823
 1824        # Added columns
 1825        added_columns = []
 1826
 1827        if access not in ["RO"]:
 1828
 1829            # prefix
 1830            if prefix in [None, True] or not isinstance(prefix, str):
 1831                if self.get_explode_infos_prefix() not in [None, True]:
 1832                    prefix = self.get_explode_infos_prefix()
 1833                else:
 1834                    prefix = "INFO/"
 1835
 1836            # table variants
 1837            if table is not None:
 1838                table_variants = table
 1839            else:
 1840                table_variants = self.get_table_variants(clause="select")
 1841
 1842            # extra infos
 1843            try:
 1844                extra_infos = self.get_extra_infos()
 1845            except:
 1846                extra_infos = []
 1847
 1848            # Header infos
 1849            header_infos = self.get_header().infos
 1850
 1851            log.debug(
 1852                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
 1853            )
 1854
 1855            sql_info_alter_table_array = []
 1856
 1857            # Info fields to check
 1858            fields_list = list(header_infos)
 1859            if fields:
 1860                fields_list += fields
 1861            fields_list = set(fields_list)
 1862
 1863            # If no fields
 1864            if not fields:
 1865                fields = []
 1866
 1867            # Translate fields if patterns
 1868            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
 1869
 1870            for info in fields:
 1871
 1872                info_id_sql = prefix + info
 1873
 1874                if (
 1875                    info in fields_list
 1876                    or prefix + info in fields_list
 1877                    or info in extra_infos
 1878                ):
 1879
 1880                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
 1881
 1882                    if info in header_infos:
 1883                        info_type = header_infos[info].type
 1884                        info_num = header_infos[info].num
 1885                    else:
 1886                        info_type = "String"
 1887                        info_num = 0
 1888
 1889                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
 1890                    if info_num != 1:
 1891                        type_sql = "VARCHAR"
 1892
 1893                    # Add field
 1894                    added_column = self.add_column(
 1895                        table_name=table_variants,
 1896                        column_name=info_id_sql,
 1897                        column_type=type_sql,
 1898                        default_value="null",
 1899                        drop=force,
 1900                    )
 1901
 1902                    if added_column:
 1903                        added_columns.append(added_column)
 1904
 1905                    if added_column or force:
 1906
 1907                        # add field to index
 1908                        self.index_additionnal_fields.append(info_id_sql)
 1909
 1910                        # Update field array
 1911                        if connexion_format in ["duckdb"]:
 1912                            update_info_field = f"""
 1913                            "{info_id_sql}" =
 1914                                CASE
 1915                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
 1916                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
 1917                                END
 1918                            """
 1919                        elif connexion_format in ["sqlite"]:
 1920                            update_info_field = f"""
 1921                                "{info_id_sql}" =
 1922                                    CASE
 1923                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
 1924                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
 1925                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
 1926                                    END
 1927                            """
 1928
 1929                        sql_info_alter_table_array.append(update_info_field)
 1930
 1931            if sql_info_alter_table_array:
 1932
 1933                # By chromosomes
 1934                try:
 1935                    chromosomes_list = list(
 1936                        self.get_query_to_df(
 1937                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
 1938                        )["#CHROM"]
 1939                    )
 1940                except:
 1941                    chromosomes_list = [None]
 1942
 1943                for chrom in chromosomes_list:
 1944                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
 1945
 1946                    # Where clause
 1947                    where_clause = ""
 1948                    if chrom and len(chromosomes_list) > 1:
 1949                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
 1950
 1951                    # Update table
 1952                    if proccess_all_fields_together:
 1953                        sql_info_alter_table_array_join = ", ".join(
 1954                            sql_info_alter_table_array
 1955                        )
 1956                        if sql_info_alter_table_array_join:
 1957                            sql_info_alter_table = f"""
 1958                                UPDATE {table_variants}
 1959                                SET {sql_info_alter_table_array_join}
 1960                                {where_clause}
 1961                                """
 1962                            log.debug(
 1963                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
 1964                            )
 1965                            # log.debug(sql_info_alter_table)
 1966                            self.conn.execute(sql_info_alter_table)
 1967                    else:
 1968                        sql_info_alter_num = 0
 1969                        for sql_info_alter in sql_info_alter_table_array:
 1970                            sql_info_alter_num += 1
 1971                            sql_info_alter_table = f"""
 1972                                UPDATE {table_variants}
 1973                                SET {sql_info_alter}
 1974                                {where_clause}
 1975                                """
 1976                            log.debug(
 1977                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
 1978                            )
 1979                            # log.debug(sql_info_alter_table)
 1980                            self.conn.execute(sql_info_alter_table)
 1981
 1982        # create indexes
 1983        if create_index:
 1984            self.create_indexes()
 1985
 1986        return added_columns
 1987
 1988    def create_indexes(self) -> None:
 1989        """
 1990        Create indexes on the table after insertion
 1991        """
 1992
 1993        # Access
 1994        access = self.get_config().get("access", None)
 1995
 1996        # get table variants
 1997        table_variants = self.get_table_variants("FROM")
 1998
 1999        if self.get_indexing() and access not in ["RO"]:
 2000            # Create index
 2001            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2002            self.conn.execute(sql_create_table_index)
 2003            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2004            self.conn.execute(sql_create_table_index)
 2005            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2006            self.conn.execute(sql_create_table_index)
 2007            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2008            self.conn.execute(sql_create_table_index)
 2009            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2010            self.conn.execute(sql_create_table_index)
 2011            for field in self.index_additionnal_fields:
 2012                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2013                self.conn.execute(sql_create_table_index)
 2014
 2015    def drop_indexes(self) -> None:
 2016        """
 2017        Create indexes on the table after insertion
 2018        """
 2019
 2020        # Access
 2021        access = self.get_config().get("access", None)
 2022
 2023        # get table variants
 2024        table_variants = self.get_table_variants("FROM")
 2025
 2026        # Get database format
 2027        connexion_format = self.get_connexion_format()
 2028
 2029        if access not in ["RO"]:
 2030            if connexion_format in ["duckdb"]:
 2031                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2032            elif connexion_format in ["sqlite"]:
 2033                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2034
 2035            list_indexes = self.conn.execute(sql_list_indexes)
 2036            index_names = [row[0] for row in list_indexes.fetchall()]
 2037            for index in index_names:
 2038                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2039                self.conn.execute(sql_drop_table_index)
 2040
 2041    def read_vcf_header(self, f) -> list:
 2042        """
 2043        It reads the header of a VCF file and returns a list of the header lines
 2044
 2045        :param f: the file object
 2046        :return: The header lines of the VCF file.
 2047        """
 2048
 2049        header_list = []
 2050        for line in f:
 2051            header_list.append(line)
 2052            if line.startswith("#CHROM"):
 2053                break
 2054        return header_list
 2055
 2056    def read_vcf_header_file(self, file: str = None) -> list:
 2057        """
 2058        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2059        uncompressed files.
 2060
 2061        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2062        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2063        default to `None`
 2064        :type file: str
 2065        :return: The function `read_vcf_header_file` returns a list.
 2066        """
 2067
 2068        if self.get_input_compressed(input_file=file):
 2069            with bgzf.open(file, "rt") as f:
 2070                return self.read_vcf_header(f=f)
 2071        else:
 2072            with open(file, "rt") as f:
 2073                return self.read_vcf_header(f=f)
 2074
 2075    def execute_query(self, query: str):
 2076        """
 2077        It takes a query as an argument, executes it, and returns the results
 2078
 2079        :param query: The query to be executed
 2080        :return: The result of the query is being returned.
 2081        """
 2082        if query:
 2083            return self.conn.execute(query)  # .fetchall()
 2084        else:
 2085            return None
 2086
 2087    def export_output(
 2088        self,
 2089        output_file: str | None = None,
 2090        output_header: str | None = None,
 2091        export_header: bool = True,
 2092        query: str | None = None,
 2093        parquet_partitions: list | None = None,
 2094        chunk_size: int | None = None,
 2095        threads: int | None = None,
 2096        sort: bool = False,
 2097        index: bool = False,
 2098        order_by: str | None = None,
 2099        fields_to_rename: dict | None = None
 2100    ) -> bool:
 2101        """
 2102        The `export_output` function exports data from a VCF file to various formats, including VCF,
 2103        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
 2104        partitioning.
 2105        
 2106        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2107        output file where the exported data will be saved
 2108        :type output_file: str | None
 2109        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2110        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2111        header will be exported to a file with the same name as the `output_file` parameter, but with
 2112        the extension "
 2113        :type output_header: str | None
 2114        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2115        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2116        True, the header will be exported to a file. If `export_header` is False, the header will not
 2117        be, defaults to True
 2118        :type export_header: bool (optional)
 2119        :param query: The `query` parameter in the `export_output` function is an optional SQL query
 2120        that can be used to filter and select specific data from the VCF file before exporting it. If
 2121        provided, only the data that matches the query will be exported. This allows you to customize
 2122        the exported data based on
 2123        :type query: str | None
 2124        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2125        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2126        organize data in a hierarchical directory structure based on the values of one or more columns.
 2127        This can improve query performance when working with large datasets
 2128        :type parquet_partitions: list | None
 2129        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
 2130        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
 2131        multiple files. It helps in optimizing the export process by breaking down the data into
 2132        manageable chunks for processing and storage
 2133        :type chunk_size: int | None
 2134        :param threads: The `threads` parameter in the `export_output` function specifies the number of
 2135        threads to be used during the export process. It determines the level of parallelism and can
 2136        improve the performance of the export operation. If this parameter is not provided, the function
 2137        will use the default number of threads
 2138        :type threads: int | None
 2139        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
 2140        determines whether the output file should be sorted based on genomic coordinates of the
 2141        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
 2142        `False`,, defaults to False
 2143        :type sort: bool (optional)
 2144        :param index: The `index` parameter in the `export_output` function is a boolean flag that
 2145        determines whether an index should be created on the output file. If `index` is set to `True`,
 2146        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
 2147        :type index: bool (optional)
 2148        :param order_by: The `order_by` parameter in the `export_output` function is a string that
 2149        specifies the column(s) to use for sorting the output file. This parameter is only applicable
 2150        when exporting data in VCF format. It allows you to specify the column(s) based on which the
 2151        output file should be
 2152        :type order_by: str | None
 2153        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
 2154        mapping of field names to be renamed during the export process. This parameter allows you to
 2155        customize the output field names before exporting the data. Each key-value pair in the
 2156        dictionary represents the original field name as the key and the new field name
 2157        :type fields_to_rename: dict | None
 2158        :return: The `export_output` function returns a boolean value. It checks if the output file
 2159        exists and returns True if it does, or None if it doesn't.
 2160        """
 2161
 2162        # Log
 2163        log.info("Exporting...")
 2164
 2165        # Full path
 2166        output_file = full_path(output_file)
 2167        output_header = full_path(output_header)
 2168
 2169        # Config
 2170        config = self.get_config()
 2171
 2172        # Param
 2173        param = self.get_param()
 2174
 2175        # Tmp files to remove
 2176        tmp_to_remove = []
 2177
 2178        # If no output, get it
 2179        if not output_file:
 2180            output_file = self.get_output()
 2181
 2182        # If not threads
 2183        if not threads:
 2184            threads = self.get_threads()
 2185
 2186        # Rename fields
 2187        if not fields_to_rename:
 2188            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
 2189        self.rename_info_fields(fields_to_rename=fields_to_rename)
 2190
 2191        # Auto header name with extension
 2192        if export_header or output_header:
 2193            if not output_header:
 2194                output_header = f"{output_file}.hdr"
 2195            # Export header
 2196            self.export_header(output_file=output_file)
 2197
 2198        # Switch off export header if VCF output
 2199        output_file_type = get_file_format(output_file)
 2200        if output_file_type in ["vcf"]:
 2201            export_header = False
 2202            tmp_to_remove.append(output_header)
 2203
 2204        # Chunk size
 2205        if not chunk_size:
 2206            chunk_size = config.get("chunk_size", None)
 2207
 2208        # Parquet partition
 2209        if not parquet_partitions:
 2210            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2211        if parquet_partitions and isinstance(parquet_partitions, str):
 2212            parquet_partitions = parquet_partitions.split(",")
 2213
 2214        # Order by
 2215        if not order_by:
 2216            order_by = param.get("export", {}).get("order_by", "")
 2217
 2218        # Header in output
 2219        header_in_output = param.get("export", {}).get("include_header", False)
 2220
 2221        # Database
 2222        database_source = self.get_connexion()
 2223
 2224        # Connexion format
 2225        connexion_format = self.get_connexion_format()
 2226
 2227        # Explode infos
 2228        if self.get_explode_infos():
 2229            self.explode_infos(
 2230                prefix=self.get_explode_infos_prefix(),
 2231                fields=self.get_explode_infos_fields(),
 2232                force=False,
 2233            )
 2234
 2235        # if connexion_format in ["sqlite"] or query:
 2236        if connexion_format in ["sqlite"]:
 2237
 2238            # Export in Parquet
 2239            random_tmp = "".join(
 2240                random.choice(string.ascii_lowercase) for i in range(10)
 2241            )
 2242            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2243            tmp_to_remove.append(database_source)
 2244
 2245            # Table Variants
 2246            table_variants = self.get_table_variants()
 2247
 2248            # Create export query
 2249            sql_query_export_subquery = f"""
 2250                SELECT * FROM {table_variants}
 2251                """
 2252
 2253            # Write source file
 2254            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2255
 2256        # Create database
 2257        database = Database(
 2258            database=database_source,
 2259            table="variants",
 2260            header_file=output_header,
 2261            conn_config=self.get_connexion_config(),
 2262        )
 2263
 2264        # Existing colomns header
 2265        existing_columns_header = database.get_header_columns_from_database(query=query)
 2266
 2267        # Sample list
 2268        if output_file_type in ["vcf"]:
 2269            get_samples = self.get_samples()
 2270            get_samples_check = self.get_samples_check()
 2271            samples_force = get_samples is not None
 2272            sample_list = self.get_header_sample_list(
 2273                check=get_samples_check,
 2274                samples=get_samples,
 2275                samples_force=samples_force,
 2276            )
 2277        else:
 2278            sample_list = None
 2279
 2280        # Export file
 2281        database.export(
 2282            output_database=output_file,
 2283            output_header=output_header,
 2284            existing_columns_header=existing_columns_header,
 2285            parquet_partitions=parquet_partitions,
 2286            chunk_size=chunk_size,
 2287            threads=threads,
 2288            sort=sort,
 2289            index=index,
 2290            header_in_output=header_in_output,
 2291            order_by=order_by,
 2292            query=query,
 2293            export_header=export_header,
 2294            sample_list=sample_list,
 2295        )
 2296
 2297        # Remove
 2298        remove_if_exists(tmp_to_remove)
 2299
 2300        return (os.path.exists(output_file) or None) and (
 2301            os.path.exists(output_file) or None
 2302        )
 2303
 2304    def get_extra_infos(self, table: str = None) -> list:
 2305        """
 2306        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2307        in the header.
 2308
 2309        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2310        name of the table from which you want to retrieve the extra columns that are not present in the
 2311        header. If the `table` parameter is not provided when calling the function, it will default to
 2312        using the variants
 2313        :type table: str
 2314        :return: A list of columns that are in the specified table but not in the header of the table.
 2315        """
 2316
 2317        header_columns = []
 2318
 2319        if not table:
 2320            table = self.get_table_variants(clause="from")
 2321            header_columns = self.get_header_columns()
 2322
 2323        # Check all columns in the database
 2324        query = f""" SELECT * FROM {table} LIMIT 1 """
 2325        log.debug(f"query {query}")
 2326        table_columns = self.get_query_to_df(query).columns.tolist()
 2327        extra_columns = []
 2328
 2329        # Construct extra infos (not in header)
 2330        for column in table_columns:
 2331            if column not in header_columns:
 2332                extra_columns.append(column)
 2333
 2334        return extra_columns
 2335
 2336    def get_extra_infos_sql(self, table: str = None) -> str:
 2337        """
 2338        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2339        by double quotes
 2340
 2341        :param table: The name of the table to get the extra infos from. If None, the default table is
 2342        used
 2343        :type table: str
 2344        :return: A string of the extra infos
 2345        """
 2346
 2347        return ", ".join(
 2348            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2349        )
 2350
 2351    def export_header(
 2352        self,
 2353        header_name: str = None,
 2354        output_file: str = None,
 2355        output_file_ext: str = ".hdr",
 2356        clean_header: bool = True,
 2357        remove_chrom_line: bool = False,
 2358    ) -> str:
 2359        """
 2360        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2361        specified options, and writes it to a new file.
 2362
 2363        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2364        this parameter is not specified, the header will be written to the output file
 2365        :type header_name: str
 2366        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2367        specify the name of the output file where the header will be written. If this parameter is not
 2368        provided, the header will be written to a temporary file
 2369        :type output_file: str
 2370        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2371        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2372        if not specified by the user. This extension will be appended to the `output_file` name to
 2373        create the final, defaults to .hdr
 2374        :type output_file_ext: str (optional)
 2375        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2376        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2377        `True`, the function will clean the header by modifying certain lines based on a specific
 2378        pattern. If `clean_header`, defaults to True
 2379        :type clean_header: bool (optional)
 2380        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2381        boolean flag that determines whether the #CHROM line should be removed from the header before
 2382        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2383        defaults to False
 2384        :type remove_chrom_line: bool (optional)
 2385        :return: The function `export_header` returns the name of the temporary header file that is
 2386        created.
 2387        """
 2388
 2389        if not header_name and not output_file:
 2390            output_file = self.get_output()
 2391
 2392        if self.get_header():
 2393
 2394            # Get header object
 2395            header_obj = self.get_header()
 2396
 2397            # Create database
 2398            db_for_header = Database(database=self.get_input())
 2399
 2400            # Get real columns in the file
 2401            db_header_columns = db_for_header.get_columns()
 2402
 2403            with tempfile.TemporaryDirectory() as tmpdir:
 2404
 2405                # Write header file
 2406                header_file_tmp = os.path.join(tmpdir, "header")
 2407                f = open(header_file_tmp, "w")
 2408                vcf.Writer(f, header_obj)
 2409                f.close()
 2410
 2411                # Replace #CHROM line with rel columns
 2412                header_list = db_for_header.read_header_file(
 2413                    header_file=header_file_tmp
 2414                )
 2415                header_list[-1] = "\t".join(db_header_columns)
 2416
 2417                # Remove CHROM line
 2418                if remove_chrom_line:
 2419                    header_list.pop()
 2420
 2421                # Clean header
 2422                if clean_header:
 2423                    header_list_clean = []
 2424                    for head in header_list:
 2425                        # Clean head for malformed header
 2426                        head_clean = head
 2427                        head_clean = re.subn(
 2428                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2429                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2430                            head_clean,
 2431                            2,
 2432                        )[0]
 2433                        # Write header
 2434                        header_list_clean.append(head_clean)
 2435                    header_list = header_list_clean
 2436
 2437            tmp_header_name = output_file + output_file_ext
 2438
 2439            f = open(tmp_header_name, "w")
 2440            for line in header_list:
 2441                f.write(line)
 2442            f.close()
 2443
 2444        return tmp_header_name
 2445
 2446    def export_variant_vcf(
 2447        self,
 2448        vcf_file,
 2449        remove_info: bool = False,
 2450        add_samples: bool = True,
 2451        list_samples: list = [],
 2452        where_clause: str = "",
 2453        index: bool = False,
 2454        threads: int | None = None,
 2455    ) -> bool | None:
 2456        """
 2457        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2458        remove INFO field, add samples, and control compression and indexing.
 2459
 2460        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2461        written to. It is the output file that will contain the filtered VCF data based on the specified
 2462        parameters
 2463        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2464        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2465        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2466        in, defaults to False
 2467        :type remove_info: bool (optional)
 2468        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2469        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2470        If set to False, the samples will be removed. The default value is True, defaults to True
 2471        :type add_samples: bool (optional)
 2472        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2473        in the output VCF file. By default, all samples will be included. If you provide a list of
 2474        samples, only those samples will be included in the output file
 2475        :type list_samples: list
 2476        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2477        determines whether or not to create an index for the output VCF file. If `index` is set to
 2478        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2479        :type index: bool (optional)
 2480        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2481        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2482        will be used during the export process. More threads can potentially speed up the export process
 2483        by utilizing multiple cores of the processor. If
 2484        :type threads: int | None
 2485        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2486        method with various parameters including the output file, query, threads, sort flag, and index
 2487        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2488        specified parameters and configurations provided in the `export_variant_vcf` function.
 2489        """
 2490
 2491        # Config
 2492        config = self.get_config()
 2493
 2494        # Extract VCF
 2495        log.debug("Export VCF...")
 2496
 2497        # Table variants
 2498        table_variants = self.get_table_variants()
 2499
 2500        # Threads
 2501        if not threads:
 2502            threads = self.get_threads()
 2503
 2504        # Info fields
 2505        if remove_info:
 2506            if not isinstance(remove_info, str):
 2507                remove_info = "."
 2508            info_field = f"""'{remove_info}' as INFO"""
 2509        else:
 2510            info_field = "INFO"
 2511
 2512        # Samples fields
 2513        if add_samples:
 2514            if not list_samples:
 2515                list_samples = self.get_header_sample_list()
 2516            if list_samples:
 2517                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2518            else:
 2519                samples_fields = ""
 2520            log.debug(f"samples_fields: {samples_fields}")
 2521        else:
 2522            samples_fields = ""
 2523
 2524        # Where clause
 2525        if where_clause is None:
 2526            where_clause = ""
 2527
 2528        # Variants
 2529        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2530        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2531        log.debug(f"sql_query_select={sql_query_select}")
 2532
 2533        return self.export_output(
 2534            output_file=vcf_file,
 2535            output_header=None,
 2536            export_header=True,
 2537            query=sql_query_select,
 2538            parquet_partitions=None,
 2539            chunk_size=config.get("chunk_size", None),
 2540            threads=threads,
 2541            sort=True,
 2542            index=index,
 2543            order_by=None,
 2544        )
 2545
 2546    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2547        """
 2548        It takes a list of commands and runs them in parallel using the number of threads specified
 2549
 2550        :param commands: A list of commands to run
 2551        :param threads: The number of threads to use, defaults to 1 (optional)
 2552        """
 2553
 2554        run_parallel_commands(commands, threads)
 2555
 2556    def get_threads(self, default: int = 1) -> int:
 2557        """
 2558        This function returns the number of threads to use for a job, with a default value of 1 if not
 2559        specified.
 2560
 2561        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2562        default number of threads to use if no specific value is provided. If no value is provided for
 2563        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2564        used, defaults to 1
 2565        :type default: int (optional)
 2566        :return: the number of threads to use for the current job.
 2567        """
 2568
 2569        # Config
 2570        config = self.get_config()
 2571
 2572        # Param
 2573        param = self.get_param()
 2574
 2575        # Input threads
 2576        input_thread = param.get("threads", config.get("threads", None))
 2577
 2578        # Check threads
 2579        if not input_thread:
 2580            threads = default
 2581        elif int(input_thread) <= 0:
 2582            threads = os.cpu_count()
 2583        else:
 2584            threads = int(input_thread)
 2585        return threads
 2586
 2587    def get_memory(self, default: str = None) -> str:
 2588        """
 2589        This function retrieves the memory value from parameters or configuration with a default value
 2590        if not found.
 2591
 2592        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2593        default value is used as a fallback in case the `memory` parameter is not provided in the
 2594        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2595        the function
 2596        :type default: str
 2597        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2598        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2599        return the default value provided as an argument to the function.
 2600        """
 2601
 2602        # Config
 2603        config = self.get_config()
 2604
 2605        # Param
 2606        param = self.get_param()
 2607
 2608        # Input threads
 2609        input_memory = param.get("memory", config.get("memory", None))
 2610
 2611        # Check threads
 2612        if input_memory:
 2613            memory = input_memory
 2614        else:
 2615            memory = default
 2616
 2617        return memory
 2618
 2619    def update_from_vcf(self, vcf_file: str) -> None:
 2620        """
 2621        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2622
 2623        :param vcf_file: the path to the VCF file
 2624        """
 2625
 2626        connexion_format = self.get_connexion_format()
 2627
 2628        if connexion_format in ["duckdb"]:
 2629            self.update_from_vcf_duckdb(vcf_file)
 2630        elif connexion_format in ["sqlite"]:
 2631            self.update_from_vcf_sqlite(vcf_file)
 2632
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table from a VCF file (DuckDB).

        The VCF body is loaded into a pandas DataFrame, then a correlated
        UPDATE appends the VCF INFO (separated by ';') to the existing INFO of
        each variant matching on "#CHROM"/"POS"/"REF"/"ALT".

        :param vcf_file: the path to the VCF file
        """

        # Variants table name
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame: skip the meta-header lines so
        # that the "#CHROM" line becomes the column header
        # (presumably get_header_length counts the "##..." lines — TODO confirm)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: `vcf_df` looks unused but is referenced by name in the SQL
        # below ("FROM vcf_df"): DuckDB resolves it through its replacement
        # scan of local Python variables — do not rename or remove it.
        # When no VCF row matches, the correlated subquery yields NULL, which
        # DuckDB's concat() treats as an empty string, so unmatched variants
        # keep their INFO unchanged.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2688
 2689    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2690        """
 2691        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2692        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2693        table
 2694
 2695        :param vcf_file: The path to the VCF file you want to update the database with
 2696        """
 2697
 2698        # Create a temporary table for the VCF
 2699        table_vcf = "tmp_vcf"
 2700        sql_create = (
 2701            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2702        )
 2703        self.conn.execute(sql_create)
 2704
 2705        # Loading VCF into temporaire table
 2706        vcf_df = pd.read_csv(
 2707            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2708        )
 2709        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2710        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2711
 2712        # Update table 'variants' with VCF data
 2713        # warning: CONCAT as || operator
 2714        sql_query_update = f"""
 2715            UPDATE variants as table_variants
 2716            SET INFO = CASE
 2717                            WHEN INFO NOT IN ('', '.')
 2718                            THEN INFO
 2719                            ELSE ''
 2720                        END ||
 2721                        (
 2722                        SELECT 
 2723                            CASE 
 2724                                WHEN table_variants.INFO NOT IN ('','.') 
 2725                                    AND table_vcf.INFO NOT IN ('','.')  
 2726                                THEN ';' 
 2727                                ELSE '' 
 2728                            END || 
 2729                            CASE 
 2730                                WHEN table_vcf.INFO NOT IN ('','.') 
 2731                                THEN table_vcf.INFO 
 2732                                ELSE '' 
 2733                            END
 2734                        FROM {table_vcf} as table_vcf
 2735                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2736                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2737                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2738                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2739                        )
 2740        """
 2741        self.conn.execute(sql_query_update)
 2742
 2743        # Drop temporary table
 2744        sql_drop = f"DROP TABLE {table_vcf}"
 2745        self.conn.execute(sql_drop)
 2746
 2747    def drop_variants_table(self) -> None:
 2748        """
 2749        > This function drops the variants table
 2750        """
 2751
 2752        table_variants = self.get_table_variants()
 2753        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2754        self.conn.execute(sql_table_variants)
 2755
 2756    def set_variant_id(
 2757        self, variant_id_column: str = "variant_id", force: bool = None
 2758    ) -> str:
 2759        """
 2760        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2761        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2762
 2763        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2764        to variant_id
 2765        :type variant_id_column: str (optional)
 2766        :param force: If True, the variant_id column will be created even if it already exists
 2767        :type force: bool
 2768        :return: The name of the column that contains the variant_id
 2769        """
 2770
 2771        # Assembly
 2772        assembly = self.get_param().get(
 2773            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2774        )
 2775
 2776        # INFO/Tag prefix
 2777        prefix = self.get_explode_infos_prefix()
 2778
 2779        # Explode INFO/SVTYPE
 2780        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2781
 2782        # variants table
 2783        table_variants = self.get_table_variants()
 2784
 2785        # variant_id column
 2786        if not variant_id_column:
 2787            variant_id_column = "variant_id"
 2788
 2789        # Creta variant_id column
 2790        if "variant_id" not in self.get_extra_infos() or force:
 2791
 2792            # Create column
 2793            self.add_column(
 2794                table_name=table_variants,
 2795                column_name=variant_id_column,
 2796                column_type="UBIGINT",
 2797                default_value="0",
 2798            )
 2799
 2800            # Update column
 2801            self.conn.execute(
 2802                f"""
 2803                    UPDATE {table_variants}
 2804                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2805                """
 2806            )
 2807
 2808        # Remove added columns
 2809        for added_column in added_columns:
 2810            self.drop_column(column=added_column)
 2811
 2812        # return variant_id column name
 2813        return variant_id_column
 2814
 2815    def get_variant_id_column(
 2816        self, variant_id_column: str = "variant_id", force: bool = None
 2817    ) -> str:
 2818        """
 2819        This function returns the variant_id column name
 2820
 2821        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2822        defaults to variant_id
 2823        :type variant_id_column: str (optional)
 2824        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2825        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2826        if it is not already set, or if it is set
 2827        :type force: bool
 2828        :return: The variant_id column name.
 2829        """
 2830
 2831        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2832
 2833    ###
 2834    # Annotation
 2835    ###
 2836
 2837    def scan_databases(
 2838        self,
 2839        database_formats: list = ["parquet"],
 2840        database_releases: list = ["current"],
 2841    ) -> dict:
 2842        """
 2843        The function `scan_databases` scans for available databases based on specified formats and
 2844        releases.
 2845
 2846        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2847        of the databases to be scanned. In this case, the accepted format is "parquet"
 2848        :type database_formats: list ["parquet"]
 2849        :param database_releases: The `database_releases` parameter is a list that specifies the
 2850        releases of the databases to be scanned. In the provided function, the default value for
 2851        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2852        databases that are in the "current"
 2853        :type database_releases: list
 2854        :return: The function `scan_databases` returns a dictionary containing information about
 2855        databases that match the specified formats and releases.
 2856        """
 2857
 2858        # Config
 2859        config = self.get_config()
 2860
 2861        # Param
 2862        param = self.get_param()
 2863
 2864        # Param - Assembly
 2865        assembly = param.get("assembly", config.get("assembly", None))
 2866        if not assembly:
 2867            assembly = DEFAULT_ASSEMBLY
 2868            log.warning(f"Default assembly '{assembly}'")
 2869
 2870        # Scan for availabled databases
 2871        log.info(
 2872            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2873        )
 2874        databases_infos_dict = databases_infos(
 2875            database_folder_releases=database_releases,
 2876            database_formats=database_formats,
 2877            assembly=assembly,
 2878            config=config,
 2879        )
 2880        log.info(
 2881            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2882        )
 2883
 2884        return databases_infos_dict
 2885
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        Quick-annotation strings from `param["annotations"]` and the per-tool
        shortcut keys (`annotation_parquet`, `annotation_snpsift`, `annotation_snpeff`,
        `annotation_bcftools`, `annotation_annovar`, `annotation_exomiser`,
        `annotation_splice`) are parsed into the structured `param["annotation"]`
        dict, then each configured annotation tool is run in turn.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config; fall back to default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders: union of the annotations, parquet
        # and bcftools configured folders
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated quick-annotation string)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param: fold per-tool shortcut options into the quick list,
        # prefixing with the tool name where needed ("," inside a tool's value
        # is replaced by "+" so it survives the later comma split)
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list back into a single comma-separated string
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: normalize to a dict
            # {annotation_source: fields}; a plain string becomes one entry
            # per comma-separated token with all INFO fields selected (None)
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL: replace the "ALL[:...]" token by
                # every database found by scan_databases
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases), e.g.
                    # "ALL:format=parquet+vcf:release=current"
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters (all INFO fields)
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init: fields requested for this annotation source
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff: everything after "snpeff:" is treated
                    # as snpEff command-line options
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar: each ":"-separated token after
                    # "annovar" is an Annovar annotation name
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser: parse "exomiser:key=value:..." string
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice: parse "splice:key=value:..." string
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS (file-based annotation)
                    else:

                        # Tools detection: an explicit "tool:" prefix pins the
                        # tool; otherwise it is inferred from the file format below
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("bigwig:"):
                            annotation_tool_initial = "bigwig"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ("+" and ":" both act as separators)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file: as-is, expanded full path, then by
                                # scanning the configured database folders
                                annotation_file_found = None

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file
                                elif os.path.exists(full_path(annotation_file)):
                                    annotation_file_found = full_path(annotation_file)
                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders (no assembly subfolder)
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    # Inspect the database file to infer format,
                                    # compression and tabix index availability
                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # hard-coded False: the bcftools branch below
                                    # is currently unreachable unless forced
                                    bcftools_preference = False

                                    # Check Annotation Tool: infer from format
                                    # when no explicit "tool:" prefix was given
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            # NOTE(review): "tsv" is listed twice — harmless duplicate
                                            annotation_tool = "parquet"
                                        elif quick_annotation_format in ["bw"]:
                                            annotation_tool = "bigwig"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register the file
                                    # under param["annotation"][tool]["annotations"]
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    # Missing database file: warn and skip (best-effort)
                                    log.warning(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

                # Persist the structured annotation parameters
                self.set_param(param)

        # Run each configured annotation tool in turn
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("bigwig", None):
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 3264    def annotation_bigwig(self, threads: int = None) -> None:
 3265        """
 3266        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3267
 3268        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3269        number of threads to be used for parallel processing during the annotation process. If the
 3270        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3271        threads to use based on the system configuration
 3272        :type threads: int
 3273        :return: True
 3274        """
 3275
 3276        # DEBUG
 3277        log.debug("Start annotation with bigwig databases")
 3278
 3279        # # Threads
 3280        # if not threads:
 3281        #     threads = self.get_threads()
 3282        # log.debug("Threads: " + str(threads))
 3283
 3284        # Config
 3285        config = self.get_config()
 3286        log.debug("Config: " + str(config))
 3287
 3288        # Config - BCFTools databases folders
 3289        databases_folders = set(
 3290            self.get_config()
 3291            .get("folders", {})
 3292            .get("databases", {})
 3293            .get("annotations", ["."])
 3294            + self.get_config()
 3295            .get("folders", {})
 3296            .get("databases", {})
 3297            .get("bigwig", ["."])
 3298        )
 3299        log.debug("Databases annotations: " + str(databases_folders))
 3300
 3301        # Param
 3302        annotations = (
 3303            self.get_param()
 3304            .get("annotation", {})
 3305            .get("bigwig", {})
 3306            .get("annotations", None)
 3307        )
 3308        log.debug("Annotations: " + str(annotations))
 3309
 3310        # Assembly
 3311        assembly = self.get_param().get(
 3312            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3313        )
 3314
 3315        # Data
 3316        table_variants = self.get_table_variants()
 3317
 3318        # Check if not empty
 3319        log.debug("Check if not empty")
 3320        sql_query_chromosomes = (
 3321            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3322        )
 3323        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3324        if not sql_query_chromosomes_df["count"][0]:
 3325            log.info(f"VCF empty")
 3326            return
 3327
 3328        # VCF header
 3329        vcf_reader = self.get_header()
 3330        log.debug("Initial header: " + str(vcf_reader.infos))
 3331
 3332        # Existing annotations
 3333        for vcf_annotation in self.get_header().infos:
 3334
 3335            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3336            log.debug(
 3337                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3338            )
 3339
 3340        if annotations:
 3341
 3342            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3343
 3344                # Export VCF file
 3345                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3346
 3347                # annotation_bigwig_config
 3348                annotation_bigwig_config_list = []
 3349
 3350                for annotation in annotations:
 3351                    annotation_fields = annotations[annotation]
 3352
 3353                    # Annotation Name
 3354                    annotation_name = os.path.basename(annotation)
 3355
 3356                    if not annotation_fields:
 3357                        annotation_fields = {"INFO": None}
 3358
 3359                    log.debug(f"Annotation '{annotation_name}'")
 3360                    log.debug(
 3361                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3362                    )
 3363
 3364                    # Create Database
 3365                    database = Database(
 3366                        database=annotation,
 3367                        databases_folders=databases_folders,
 3368                        assembly=assembly,
 3369                    )
 3370
 3371                    # Find files
 3372                    db_file = database.get_database()
 3373                    db_file = full_path(db_file)
 3374                    db_hdr_file = database.get_header_file()
 3375                    db_hdr_file = full_path(db_hdr_file)
 3376                    db_file_type = database.get_format()
 3377
 3378                    # If db_file is http ?
 3379                    if database.get_database().startswith("http"):
 3380
 3381                        # Datbase is HTTP URL
 3382                        db_file_is_http = True
 3383
 3384                        # DB file keep as URL
 3385                        db_file = database.get_database()
 3386                        log.warning(
 3387                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3388                        )
 3389
 3390                        # Retrieve automatic annotation field name
 3391                        annotation_field = clean_annotation_field(
 3392                            os.path.basename(db_file).replace(".bw", "")
 3393                        )
 3394                        log.debug(
 3395                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3396                        )
 3397
 3398                        # Create automatic header file
 3399                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3400                        with open(db_hdr_file, "w") as f:
 3401                            f.write("##fileformat=VCFv4.2\n")
 3402                            f.write(
 3403                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3404                            )
 3405                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3406
 3407                    else:
 3408
 3409                        # Datbase is NOT HTTP URL
 3410                        db_file_is_http = False
 3411
 3412                    # Check index - try to create if not exists
 3413                    if (
 3414                        db_file is None
 3415                        or db_hdr_file is None
 3416                        or (not os.path.exists(db_file) and not db_file_is_http)
 3417                        or not os.path.exists(db_hdr_file)
 3418                        or not db_file_type in ["bw"]
 3419                    ):
 3420                        # if False:
 3421                        log.error("Annotation failed: database not valid")
 3422                        log.error(f"Annotation annotation file: {db_file}")
 3423                        log.error(f"Annotation annotation file type: {db_file_type}")
 3424                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3425                        raise ValueError(
 3426                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3427                        )
 3428                    else:
 3429
 3430                        # Log
 3431                        log.debug(
 3432                            f"Annotation '{annotation}' - file: "
 3433                            + str(db_file)
 3434                            + " and "
 3435                            + str(db_hdr_file)
 3436                        )
 3437
 3438                        # Load header as VCF object
 3439                        db_hdr_vcf = Variants(input=db_hdr_file)
 3440                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3441                        log.debug(
 3442                            "Annotation database header: "
 3443                            + str(db_hdr_vcf_header_infos)
 3444                        )
 3445
 3446                        # For all fields in database
 3447                        annotation_fields_full = False
 3448                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3449                            annotation_fields = {
 3450                                key: key for key in db_hdr_vcf_header_infos
 3451                            }
 3452                            log.debug(
 3453                                "Annotation database header - All annotations added: "
 3454                                + str(annotation_fields)
 3455                            )
 3456                            annotation_fields_full = True
 3457
 3458                        # Init
 3459                        cyvcf2_header_rename_dict = {}
 3460                        cyvcf2_header_list = []
 3461                        cyvcf2_header_indexes = {}
 3462
 3463                        # process annotation fields
 3464                        for annotation_field in annotation_fields:
 3465
 3466                            # New annotation name
 3467                            annotation_field_new = annotation_fields[annotation_field]
 3468
 3469                            # Check annotation field and index in header
 3470                            if (
 3471                                annotation_field
 3472                                in db_hdr_vcf.get_header_columns_as_list()
 3473                            ):
 3474                                annotation_field_index = (
 3475                                    db_hdr_vcf.get_header_columns_as_list().index(
 3476                                        annotation_field
 3477                                    )
 3478                                    - 3
 3479                                )
 3480                                cyvcf2_header_indexes[annotation_field_new] = (
 3481                                    annotation_field_index
 3482                                )
 3483                            else:
 3484                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3485                                log.error(msg_err)
 3486                                raise ValueError(msg_err)
 3487
 3488                            # Append annotation field in cyvcf2 header list
 3489                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3490                                db_hdr_vcf_header_infos[annotation_field].id
 3491                            )
 3492                            cyvcf2_header_list.append(
 3493                                {
 3494                                    "ID": annotation_field_new,
 3495                                    "Number": db_hdr_vcf_header_infos[
 3496                                        annotation_field
 3497                                    ].num,
 3498                                    "Type": db_hdr_vcf_header_infos[
 3499                                        annotation_field
 3500                                    ].type,
 3501                                    "Description": db_hdr_vcf_header_infos[
 3502                                        annotation_field
 3503                                    ].desc,
 3504                                }
 3505                            )
 3506
 3507                            # Add header on VCF
 3508                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3509                                annotation_field_new,
 3510                                db_hdr_vcf_header_infos[annotation_field].num,
 3511                                db_hdr_vcf_header_infos[annotation_field].type,
 3512                                db_hdr_vcf_header_infos[annotation_field].desc,
 3513                                "HOWARD BigWig annotation",
 3514                                "unknown",
 3515                                self.code_type_map[
 3516                                    db_hdr_vcf_header_infos[annotation_field].type
 3517                                ],
 3518                            )
 3519
 3520                        # Load bigwig database
 3521                        bw_db = pyBigWig.open(db_file)
 3522                        if bw_db.isBigWig():
 3523                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3524                        else:
 3525                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3526                            log.error(msg_err)
 3527                            raise ValueError(msg_err)
 3528
 3529                        annotation_bigwig_config_list.append(
 3530                            {
 3531                                "db_file": db_file,
 3532                                "bw_db": bw_db,
 3533                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3534                                "cyvcf2_header_list": cyvcf2_header_list,
 3535                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3536                            }
 3537                        )
 3538
 3539                # Annotate
 3540                if annotation_bigwig_config_list:
 3541
 3542                    # Annotation config
 3543                    log.debug(
 3544                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3545                    )
 3546
 3547                    # Export VCF file
 3548                    self.export_variant_vcf(
 3549                        vcf_file=tmp_vcf_name,
 3550                        remove_info=True,
 3551                        add_samples=False,
 3552                        index=True,
 3553                    )
 3554
 3555                    # Load input tmp file
 3556                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3557
 3558                    # Add header in input file
 3559                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3560                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3561                            "cyvcf2_header_list", []
 3562                        ):
 3563                            log.info(
 3564                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3565                            )
 3566                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3567
 3568                    # Create output VCF file
 3569                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3570                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3571
 3572                    # Fetch variants
 3573                    log.info(f"Annotations 'bigwig' start...")
 3574                    for variant in input_vcf:
 3575
 3576                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3577
 3578                            # DB and indexes
 3579                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3580                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3581                                "cyvcf2_header_indexes", None
 3582                            )
 3583
 3584                            # Retrieve value from chrom pos
 3585                            res = bw_db.values(
 3586                                variant.CHROM, variant.POS - 1, variant.POS
 3587                            )
 3588
 3589                            # For each annotation fields (and indexes)
 3590                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3591
 3592                                # If value is NOT nNone
 3593                                if not np.isnan(
 3594                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3595                                ):
 3596                                    variant.INFO[cyvcf2_header_index] = res[
 3597                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3598                                    ]
 3599
 3600                        # Add record in output file
 3601                        output_vcf.write_record(variant)
 3602
 3603                    # Log
 3604                    log.debug(f"Annotation done.")
 3605
 3606                    # Close and write file
 3607                    log.info(f"Annotations 'bigwig' write...")
 3608                    output_vcf.close()
 3609                    log.debug(f"Write done.")
 3610
 3611                    # Update variants
 3612                    log.info(f"Annotations 'bigwig' update...")
 3613                    self.update_from_vcf(output_vcf_file)
 3614                    log.debug(f"Update done.")
 3615
 3616        return True
 3617
 3618    def annotation_snpsift(self, threads: int = None) -> None:
 3619        """
 3620        This function annotate with bcftools
 3621
 3622        :param threads: Number of threads to use
 3623        :return: the value of the variable "return_value".
 3624        """
 3625
 3626        # DEBUG
 3627        log.debug("Start annotation with bcftools databases")
 3628
 3629        # Threads
 3630        if not threads:
 3631            threads = self.get_threads()
 3632        log.debug("Threads: " + str(threads))
 3633
 3634        # Config
 3635        config = self.get_config()
 3636        log.debug("Config: " + str(config))
 3637
 3638        # Config - snpSift
 3639        snpsift_bin_command = get_bin_command(
 3640            bin="SnpSift.jar",
 3641            tool="snpsift",
 3642            bin_type="jar",
 3643            config=config,
 3644            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3645        )
 3646        if not snpsift_bin_command:
 3647            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3648            log.error(msg_err)
 3649            raise ValueError(msg_err)
 3650
 3651        # Config - bcftools
 3652        bcftools_bin_command = get_bin_command(
 3653            bin="bcftools",
 3654            tool="bcftools",
 3655            bin_type="bin",
 3656            config=config,
 3657            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3658        )
 3659        if not bcftools_bin_command:
 3660            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3661            log.error(msg_err)
 3662            raise ValueError(msg_err)
 3663
 3664        # Config - BCFTools databases folders
 3665        databases_folders = set(
 3666            self.get_config()
 3667            .get("folders", {})
 3668            .get("databases", {})
 3669            .get("annotations", ["."])
 3670            + self.get_config()
 3671            .get("folders", {})
 3672            .get("databases", {})
 3673            .get("bcftools", ["."])
 3674        )
 3675        log.debug("Databases annotations: " + str(databases_folders))
 3676
 3677        # Param
 3678        annotations = (
 3679            self.get_param()
 3680            .get("annotation", {})
 3681            .get("snpsift", {})
 3682            .get("annotations", None)
 3683        )
 3684        log.debug("Annotations: " + str(annotations))
 3685
 3686        # Assembly
 3687        assembly = self.get_param().get(
 3688            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3689        )
 3690
 3691        # Data
 3692        table_variants = self.get_table_variants()
 3693
 3694        # Check if not empty
 3695        log.debug("Check if not empty")
 3696        sql_query_chromosomes = (
 3697            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3698        )
 3699        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3700        if not sql_query_chromosomes_df["count"][0]:
 3701            log.info(f"VCF empty")
 3702            return
 3703
 3704        # VCF header
 3705        vcf_reader = self.get_header()
 3706        log.debug("Initial header: " + str(vcf_reader.infos))
 3707
 3708        # Existing annotations
 3709        for vcf_annotation in self.get_header().infos:
 3710
 3711            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3712            log.debug(
 3713                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3714            )
 3715
 3716        if annotations:
 3717
 3718            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3719
 3720                # Export VCF file
 3721                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3722
 3723                # Init
 3724                commands = {}
 3725
 3726                for annotation in annotations:
 3727                    annotation_fields = annotations[annotation]
 3728
 3729                    # Annotation Name
 3730                    annotation_name = os.path.basename(annotation)
 3731
 3732                    if not annotation_fields:
 3733                        annotation_fields = {"INFO": None}
 3734
 3735                    log.debug(f"Annotation '{annotation_name}'")
 3736                    log.debug(
 3737                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3738                    )
 3739
 3740                    # Create Database
 3741                    database = Database(
 3742                        database=annotation,
 3743                        databases_folders=databases_folders,
 3744                        assembly=assembly,
 3745                    )
 3746
 3747                    # Find files
 3748                    db_file = database.get_database()
 3749                    db_file = full_path(db_file)
 3750                    db_hdr_file = database.get_header_file()
 3751                    db_hdr_file = full_path(db_hdr_file)
 3752                    db_file_type = database.get_format()
 3753                    db_tbi_file = f"{db_file}.tbi"
 3754                    db_file_compressed = database.is_compressed()
 3755
 3756                    # Check if compressed
 3757                    if not db_file_compressed:
 3758                        log.error(
 3759                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3760                        )
 3761                        raise ValueError(
 3762                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3763                        )
 3764
 3765                    # Check if indexed
 3766                    if not os.path.exists(db_tbi_file):
 3767                        log.error(
 3768                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3769                        )
 3770                        raise ValueError(
 3771                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3772                        )
 3773
 3774                    # Check index - try to create if not exists
 3775                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3776                        log.error("Annotation failed: database not valid")
 3777                        log.error(f"Annotation annotation file: {db_file}")
 3778                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3779                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3780                        raise ValueError(
 3781                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3782                        )
 3783                    else:
 3784
 3785                        log.debug(
 3786                            f"Annotation '{annotation}' - file: "
 3787                            + str(db_file)
 3788                            + " and "
 3789                            + str(db_hdr_file)
 3790                        )
 3791
 3792                        # Load header as VCF object
 3793                        db_hdr_vcf = Variants(input=db_hdr_file)
 3794                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3795                        log.debug(
 3796                            "Annotation database header: "
 3797                            + str(db_hdr_vcf_header_infos)
 3798                        )
 3799
 3800                        # For all fields in database
 3801                        annotation_fields_full = False
 3802                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3803                            annotation_fields = {
 3804                                key: key for key in db_hdr_vcf_header_infos
 3805                            }
 3806                            log.debug(
 3807                                "Annotation database header - All annotations added: "
 3808                                + str(annotation_fields)
 3809                            )
 3810                            annotation_fields_full = True
 3811
 3812                        # # Create file for field rename
 3813                        # log.debug("Create file for field rename")
 3814                        # tmp_rename = NamedTemporaryFile(
 3815                        #     prefix=self.get_prefix(),
 3816                        #     dir=self.get_tmp_dir(),
 3817                        #     suffix=".rename",
 3818                        #     delete=False,
 3819                        # )
 3820                        # tmp_rename_name = tmp_rename.name
 3821                        # tmp_files.append(tmp_rename_name)
 3822
 3823                        # Number of fields
 3824                        nb_annotation_field = 0
 3825                        annotation_list = []
 3826                        annotation_infos_rename_list = []
 3827
 3828                        for annotation_field in annotation_fields:
 3829
 3830                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3831                            annotation_fields_new_name = annotation_fields.get(
 3832                                annotation_field, annotation_field
 3833                            )
 3834                            if not annotation_fields_new_name:
 3835                                annotation_fields_new_name = annotation_field
 3836
 3837                            # Check if field is in DB and if field is not elready in input data
 3838                            if (
 3839                                annotation_field in db_hdr_vcf.get_header().infos
 3840                                and annotation_fields_new_name
 3841                                not in self.get_header().infos
 3842                            ):
 3843
 3844                                log.info(
 3845                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3846                                )
 3847
 3848                                # BCFTools annotate param to rename fields
 3849                                if annotation_field != annotation_fields_new_name:
 3850                                    annotation_infos_rename_list.append(
 3851                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3852                                    )
 3853
 3854                                # Add INFO field to header
 3855                                db_hdr_vcf_header_infos_number = (
 3856                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3857                                )
 3858                                db_hdr_vcf_header_infos_type = (
 3859                                    db_hdr_vcf_header_infos[annotation_field].type
 3860                                    or "String"
 3861                                )
 3862                                db_hdr_vcf_header_infos_description = (
 3863                                    db_hdr_vcf_header_infos[annotation_field].desc
 3864                                    or f"{annotation_field} description"
 3865                                )
 3866                                db_hdr_vcf_header_infos_source = (
 3867                                    db_hdr_vcf_header_infos[annotation_field].source
 3868                                    or "unknown"
 3869                                )
 3870                                db_hdr_vcf_header_infos_version = (
 3871                                    db_hdr_vcf_header_infos[annotation_field].version
 3872                                    or "unknown"
 3873                                )
 3874
 3875                                vcf_reader.infos[annotation_fields_new_name] = (
 3876                                    vcf.parser._Info(
 3877                                        annotation_fields_new_name,
 3878                                        db_hdr_vcf_header_infos_number,
 3879                                        db_hdr_vcf_header_infos_type,
 3880                                        db_hdr_vcf_header_infos_description,
 3881                                        db_hdr_vcf_header_infos_source,
 3882                                        db_hdr_vcf_header_infos_version,
 3883                                        self.code_type_map[
 3884                                            db_hdr_vcf_header_infos_type
 3885                                        ],
 3886                                    )
 3887                                )
 3888
 3889                                annotation_list.append(annotation_field)
 3890
 3891                                nb_annotation_field += 1
 3892
 3893                            else:
 3894
 3895                                if (
 3896                                    annotation_field
 3897                                    not in db_hdr_vcf.get_header().infos
 3898                                ):
 3899                                    log.warning(
 3900                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3901                                    )
 3902                                if (
 3903                                    annotation_fields_new_name
 3904                                    in self.get_header().infos
 3905                                ):
 3906                                    log.warning(
 3907                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3908                                    )
 3909
 3910                        log.info(
 3911                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3912                        )
 3913
 3914                        annotation_infos = ",".join(annotation_list)
 3915
 3916                        if annotation_infos != "":
 3917
 3918                            # Annotated VCF (and error file)
 3919                            tmp_annotation_vcf_name = os.path.join(
 3920                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3921                            )
 3922                            tmp_annotation_vcf_name_err = (
 3923                                tmp_annotation_vcf_name + ".err"
 3924                            )
 3925
 3926                            # Add fields to annotate
 3927                            if not annotation_fields_full:
 3928                                annotation_infos_option = f"-info {annotation_infos}"
 3929                            else:
 3930                                annotation_infos_option = ""
 3931
 3932                            # Info fields rename
 3933                            if annotation_infos_rename_list:
 3934                                annotation_infos_rename = " -c " + ",".join(
 3935                                    annotation_infos_rename_list
 3936                                )
 3937                            else:
 3938                                annotation_infos_rename = ""
 3939
 3940                            # Annotate command
 3941                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3942
 3943                            # Add command
 3944                            commands[command_annotate] = tmp_annotation_vcf_name
 3945
 3946                if commands:
 3947
 3948                    # Export VCF file
 3949                    self.export_variant_vcf(
 3950                        vcf_file=tmp_vcf_name,
 3951                        remove_info=True,
 3952                        add_samples=False,
 3953                        index=True,
 3954                    )
 3955                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3956
 3957                    # Num command
 3958                    nb_command = 0
 3959
 3960                    # Annotate
 3961                    for command_annotate in commands:
 3962                        nb_command += 1
 3963                        log.info(
 3964                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3965                        )
 3966                        log.debug(f"command_annotate={command_annotate}")
 3967                        run_parallel_commands([command_annotate], threads)
 3968
 3969                        # Debug
 3970                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3971
 3972                        # Update variants
 3973                        log.info(
 3974                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3975                        )
 3976                        self.update_from_vcf(commands[command_annotate])
 3977
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table with `bcftools annotate` using VCF/BED databases.

        Databases are read from param["annotation"]["bcftools"]["annotations"],
        a dict mapping each database file to its requested INFO fields (field ->
        optional new name; "ALL"/"INFO" selects every field of the database header).
        For each database the method verifies bgzip compression and a tabix
        index, registers the selected INFO fields in the in-memory VCF header,
        then builds one `bcftools annotate` command per (database, chromosome)
        restricted to a BED of merged variant regions. Commands are run in
        parallel, the per-chunk outputs are combined with `bcftools merge`, and
        the variants table is updated from the merged VCF.

        :param threads: Number of threads to use (defaults to self.get_threads())
        :return: None; returns early (without annotating) if the variants table is empty
        :raises ValueError: if the bcftools binary cannot be resolved, a database
            file is not compressed, not indexed or missing its header file, or if
            any bcftools command wrote "[E::" error messages to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads (fall back to the object's configured thread count)
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but never used below;
        # actual temp-file cleanup is the "rm -f" appended to the merge command
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command (resolved from config or the default tools folder)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and the bcftools-specific ones
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - databases to annotate with, as {database: {field: new_name_or_None}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty (nothing to annotate otherwise)
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # Temp bgzipped VCF that will hold the current variants as bcftools input;
        # delete=False because external commands read it by path later
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated in place below when new INFO fields are registered)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (logged for debugging only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            # Accumulators shared across all databases/chromosomes
            tmp_ann_vcf_list = []  # per-chunk annotated VCFs to merge at the end
            commands = []  # bcftools annotate commands to run in parallel
            tmp_files = []  # temp files removed after the merge
            err_files = []  # stderr capture files scanned for [W::/[E:: messages

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields requested -> annotate with all INFO fields
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object to locate files and detect the format
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files (data file, header sidecar, tabix index)
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate requires bgzipped input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (.tbi required for region-restricted annotation)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check data and header files exist
                # NOTE(review): despite earlier wording, no index/file is created
                # here on the fly; a missing file fails immediately
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load the database header sidecar as a VCF object to read its INFO fields
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" keyword expands to every field in the database header
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields actually selected for this database
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field new name, if configured (renaming via bcftools NEW:=INFO/OLD below)
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in the DB header and not already in the input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, defaulting missing metadata
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            # Register the (possibly renamed) field in the in-memory header
                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools -c syntax: "NEW:=INFO/OLD" renames, plain name copies as-is
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            # Field skipped: explain why (missing in DB, or already present)
                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    # Comma-separated field list for bcftools -c
                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (keep only "##" lines: no "#CHROM", no variants)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command (zcat for gzipped header sidecars, cat otherwise)
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases carry their coordinates as leading columns in -c
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One annotate command per chromosome, restricted to variant regions
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: pad each variant position by +/- `window`
                            # bases (CASE clamps negative starts to 0), then merge overlaps
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT  \"#CHROM\",
                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                        \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files (per-chunk output VCF + its stderr capture file)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: -Oz1 = bgzipped VCF at compression level 1;
                            # stderr is appended to the .err file; tabix indexes the chunk
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (INFO stripped; re-annotated from scratch by the commands)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # Split the thread budget evenly across parallel annotate commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # At least one thread per command
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands (string rewrite of each command)
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-chunk annotated VCFs back together
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file (delete=True: removed when the Python handle is closed)
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (chained after a successful merge)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge (--force-samples tolerates duplicate sample names across chunks)
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: scan captured stderr for htslib-style
                    # "[W::...]" warnings and "[E::...]" errors
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info (deduplicated warnings + errors)
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info (everything, deduplicated)
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed: any "[E::" message aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants table from the merged, annotated VCF
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
 4459    def annotation_exomiser(self, threads: int = None) -> None:
 4460        """
 4461        This function annotate with Exomiser
 4462
 4463        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4464        - "analysis" (dict/file):
 4465            Full analysis dictionnary parameters (see Exomiser docs).
 4466            Either a dict, or a file in JSON or YAML format.
 4467            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4468            Default : None
 4469        - "preset" (string):
 4470            Analysis preset (available in config folder).
 4471            Used if no full "analysis" is provided.
 4472            Default: "exome"
 4473        - "phenopacket" (dict/file):
 4474            Samples and phenotipic features parameters (see Exomiser docs).
 4475            Either a dict, or a file in JSON or YAML format.
 4476            Default: None
 4477        - "subject" (dict):
 4478            Sample parameters (see Exomiser docs).
 4479            Example:
 4480                "subject":
 4481                    {
 4482                        "id": "ISDBM322017",
 4483                        "sex": "FEMALE"
 4484                    }
 4485            Default: None
 4486        - "sample" (string):
 4487            Sample name to construct "subject" section:
 4488                "subject":
 4489                    {
 4490                        "id": "<sample>",
 4491                        "sex": "UNKNOWN_SEX"
 4492                    }
 4493            Default: None
 4494        - "phenotypicFeatures" (dict)
 4495            Phenotypic features to construct "subject" section.
 4496            Example:
 4497                "phenotypicFeatures":
 4498                    [
 4499                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4500                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4501                    ]
 4502        - "hpo" (list)
 4503            List of HPO ids as phenotypic features.
 4504            Example:
 4505                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4506            Default: []
 4507        - "outputOptions" (dict):
 4508            Output options (see Exomiser docs).
 4509            Default:
 4510                "output_options" =
 4511                    {
 4512                        "outputContributingVariantsOnly": False,
 4513                        "numGenes": 0,
 4514                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4515                    }
 4516        - "transcript_source" (string):
 4517            Transcript source (either "refseq", "ucsc", "ensembl")
 4518            Default: "refseq"
 4519        - "exomiser_to_info" (boolean):
 4520            Add exomiser TSV file columns as INFO fields in VCF.
 4521            Default: False
 4522        - "release" (string):
 4523            Exomise database release.
 4524            If not exists, database release will be downloaded (take a while).
 4525            Default: None (provided by application.properties configuration file)
 4526        - "exomiser_application_properties" (file):
 4527            Exomiser configuration file (see Exomiser docs).
 4528            Useful to automatically download databases (especially for specific genome databases).
 4529
 4530        Notes:
 4531        - If no sample in parameters, first sample in VCF will be chosen
 4532        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4533
 4534        :param threads: The number of threads to use
 4535        :return: None.
 4536        """
 4537
 4538        # DEBUG
 4539        log.debug("Start annotation with Exomiser databases")
 4540
 4541        # Threads
 4542        if not threads:
 4543            threads = self.get_threads()
 4544        log.debug("Threads: " + str(threads))
 4545
 4546        # Config
 4547        config = self.get_config()
 4548        log.debug("Config: " + str(config))
 4549
 4550        # Config - Folders - Databases
 4551        databases_folders = (
 4552            config.get("folders", {})
 4553            .get("databases", {})
 4554            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4555        )
 4556        databases_folders = full_path(databases_folders)
 4557        if not os.path.exists(databases_folders):
 4558            log.error(f"Databases annotations: {databases_folders} NOT found")
 4559        log.debug("Databases annotations: " + str(databases_folders))
 4560
 4561        # Config - Exomiser
 4562        exomiser_bin_command = get_bin_command(
 4563            bin="exomiser-cli*.jar",
 4564            tool="exomiser",
 4565            bin_type="jar",
 4566            config=config,
 4567            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4568        )
 4569        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4570        if not exomiser_bin_command:
 4571            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4572            log.error(msg_err)
 4573            raise ValueError(msg_err)
 4574
 4575        # Param
 4576        param = self.get_param()
 4577        log.debug("Param: " + str(param))
 4578
 4579        # Param - Exomiser
 4580        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4581        log.debug(f"Param Exomiser: {param_exomiser}")
 4582
 4583        # Param - Assembly
 4584        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4585        log.debug("Assembly: " + str(assembly))
 4586
 4587        # Data
 4588        table_variants = self.get_table_variants()
 4589
 4590        # Check if not empty
 4591        log.debug("Check if not empty")
 4592        sql_query_chromosomes = (
 4593            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4594        )
 4595        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4596            log.info(f"VCF empty")
 4597            return False
 4598
 4599        # VCF header
 4600        vcf_reader = self.get_header()
 4601        log.debug("Initial header: " + str(vcf_reader.infos))
 4602
 4603        # Samples
 4604        samples = self.get_header_sample_list()
 4605        if not samples:
 4606            log.error("No Samples in VCF")
 4607            return False
 4608        log.debug(f"Samples: {samples}")
 4609
 4610        # Memory limit
 4611        memory_limit = self.get_memory("8G")
 4612        log.debug(f"memory_limit: {memory_limit}")
 4613
 4614        # Exomiser java options
 4615        exomiser_java_options = (
 4616            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4617        )
 4618        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4619
 4620        # Download Exomiser (if not exists)
 4621        exomiser_release = param_exomiser.get("release", None)
 4622        exomiser_application_properties = param_exomiser.get(
 4623            "exomiser_application_properties", None
 4624        )
 4625        databases_download_exomiser(
 4626            assemblies=[assembly],
 4627            exomiser_folder=databases_folders,
 4628            exomiser_release=exomiser_release,
 4629            exomiser_phenotype_release=exomiser_release,
 4630            exomiser_application_properties=exomiser_application_properties,
 4631        )
 4632
 4633        # Force annotation
 4634        force_update_annotation = True
 4635
 4636        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4637            log.debug("Start annotation Exomiser")
 4638
 4639            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4640
 4641                # tmp_dir = "/tmp/exomiser"
 4642
 4643                ### ANALYSIS ###
 4644                ################
 4645
 4646                # Create analysis.json through analysis dict
 4647                # either analysis in param or by default
 4648                # depending on preset exome/genome)
 4649
 4650                # Init analysis dict
 4651                param_exomiser_analysis_dict = {}
 4652
 4653                # analysis from param
 4654                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4655                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4656
 4657                # If analysis in param -> load anlaysis json
 4658                if param_exomiser_analysis:
 4659
 4660                    # If param analysis is a file and exists
 4661                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4662                        param_exomiser_analysis
 4663                    ):
 4664                        # Load analysis file into analysis dict (either yaml or json)
 4665                        with open(param_exomiser_analysis) as json_file:
 4666                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4667
 4668                    # If param analysis is a dict
 4669                    elif isinstance(param_exomiser_analysis, dict):
 4670                        # Load analysis dict into analysis dict (either yaml or json)
 4671                        param_exomiser_analysis_dict = param_exomiser_analysis
 4672
 4673                    # Error analysis type
 4674                    else:
 4675                        log.error(f"Analysis type unknown. Check param file.")
 4676                        raise ValueError(f"Analysis type unknown. Check param file.")
 4677
 4678                # Case no input analysis config file/dict
 4679                # Use preset (exome/genome) to open default config file
 4680                if not param_exomiser_analysis_dict:
 4681
 4682                    # default preset
 4683                    default_preset = "exome"
 4684
 4685                    # Get param preset or default preset
 4686                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4687
 4688                    # Try to find if preset is a file
 4689                    if os.path.exists(param_exomiser_preset):
 4690                        # Preset file is provided in full path
 4691                        param_exomiser_analysis_default_config_file = (
 4692                            param_exomiser_preset
 4693                        )
 4694                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4695                    #     # Preset file is provided in full path
 4696                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4697                    elif os.path.exists(
 4698                        os.path.join(folder_config, param_exomiser_preset)
 4699                    ):
 4700                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
 4701                        param_exomiser_analysis_default_config_file = os.path.join(
 4702                            folder_config, param_exomiser_preset
 4703                        )
 4704                    else:
 4705                        # Construct preset file
 4706                        param_exomiser_analysis_default_config_file = os.path.join(
 4707                            folder_config,
 4708                            f"preset-{param_exomiser_preset}-analysis.json",
 4709                        )
 4710
 4711                    # If preset file exists
 4712                    param_exomiser_analysis_default_config_file = full_path(
 4713                        param_exomiser_analysis_default_config_file
 4714                    )
 4715                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4716                        # Load preset file into analysis dict (either yaml or json)
 4717                        with open(
 4718                            param_exomiser_analysis_default_config_file
 4719                        ) as json_file:
 4720                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4721                                json_file
 4722                            )
 4723
 4724                    # Error preset file
 4725                    else:
 4726                        log.error(
 4727                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4728                        )
 4729                        raise ValueError(
 4730                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4731                        )
 4732
 4733                # If no analysis dict created
 4734                if not param_exomiser_analysis_dict:
 4735                    log.error(f"No analysis config")
 4736                    raise ValueError(f"No analysis config")
 4737
 4738                # Log
 4739                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4740
 4741                ### PHENOPACKET ###
 4742                ###################
 4743
 4744                # If no PhenoPacket in analysis dict -> check in param
 4745                if "phenopacket" not in param_exomiser_analysis_dict:
 4746
 4747                    # If PhenoPacket in param -> load phenopacket json
 4748                    if param_exomiser.get("phenopacket", None):
 4749
 4750                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4751                        param_exomiser_phenopacket = full_path(
 4752                            param_exomiser_phenopacket
 4753                        )
 4754
 4755                        # If param phenopacket is a file and exists
 4756                        if isinstance(
 4757                            param_exomiser_phenopacket, str
 4758                        ) and os.path.exists(param_exomiser_phenopacket):
 4759                            # Load phenopacket file into analysis dict (either yaml or json)
 4760                            with open(param_exomiser_phenopacket) as json_file:
 4761                                param_exomiser_analysis_dict["phenopacket"] = (
 4762                                    yaml.safe_load(json_file)
 4763                                )
 4764
 4765                        # If param phenopacket is a dict
 4766                        elif isinstance(param_exomiser_phenopacket, dict):
 4767                            # Load phenopacket dict into analysis dict (either yaml or json)
 4768                            param_exomiser_analysis_dict["phenopacket"] = (
 4769                                param_exomiser_phenopacket
 4770                            )
 4771
 4772                        # Error phenopacket type
 4773                        else:
 4774                            log.error(f"Phenopacket type unknown. Check param file.")
 4775                            raise ValueError(
 4776                                f"Phenopacket type unknown. Check param file."
 4777                            )
 4778
 4779                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4780                if "phenopacket" not in param_exomiser_analysis_dict:
 4781
 4782                    # Init PhenoPacket
 4783                    param_exomiser_analysis_dict["phenopacket"] = {
 4784                        "id": "analysis",
 4785                        "proband": {},
 4786                    }
 4787
 4788                    ### Add subject ###
 4789
 4790                    # If subject exists
 4791                    param_exomiser_subject = param_exomiser.get("subject", {})
 4792
 4793                    # If subject does not exist -> find sample ID
 4794                    if not param_exomiser_subject:
 4795
 4796                        # Found sample ID in param
 4797                        sample = param_exomiser.get("sample", None)
 4798
 4799                        # Find sample ID (first sample)
 4800                        if not sample:
 4801                            sample_list = self.get_header_sample_list()
 4802                            if len(sample_list) > 0:
 4803                                sample = sample_list[0]
 4804                            else:
 4805                                log.error(f"No sample found")
 4806                                raise ValueError(f"No sample found")
 4807
 4808                        # Create subject
 4809                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4810
 4811                    # Add to dict
 4812                    param_exomiser_analysis_dict["phenopacket"][
 4813                        "subject"
 4814                    ] = param_exomiser_subject
 4815
 4816                    ### Add "phenotypicFeatures" ###
 4817
 4818                    # If phenotypicFeatures exists
 4819                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4820                        "phenotypicFeatures", []
 4821                    )
 4822
 4823                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4824                    if not param_exomiser_phenotypicfeatures:
 4825
 4826                        # Found HPO in param
 4827                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4828
 4829                        # Split HPO if list in string format separated by comma
 4830                        if isinstance(param_exomiser_hpo, str):
 4831                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4832
 4833                        # Create HPO list
 4834                        for hpo in param_exomiser_hpo:
 4835                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4836                            param_exomiser_phenotypicfeatures.append(
 4837                                {
 4838                                    "type": {
 4839                                        "id": f"HP:{hpo_clean}",
 4840                                        "label": f"HP:{hpo_clean}",
 4841                                    }
 4842                                }
 4843                            )
 4844
 4845                    # Add to dict
 4846                    param_exomiser_analysis_dict["phenopacket"][
 4847                        "phenotypicFeatures"
 4848                    ] = param_exomiser_phenotypicfeatures
 4849
 4850                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4851                    if not param_exomiser_phenotypicfeatures:
 4852                        for step in param_exomiser_analysis_dict.get(
 4853                            "analysis", {}
 4854                        ).get("steps", []):
 4855                            if "hiPhivePrioritiser" in step:
 4856                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4857                                    "steps", []
 4858                                ).remove(step)
 4859
 4860                ### Add Input File ###
 4861
 4862                # Initial file name and htsFiles
 4863                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4864                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4865                    {
 4866                        "uri": tmp_vcf_name,
 4867                        "htsFormat": "VCF",
 4868                        "genomeAssembly": assembly,
 4869                    }
 4870                ]
 4871
 4872                ### Add metaData ###
 4873
 4874                # If metaData not in analysis dict
 4875                if "metaData" not in param_exomiser_analysis_dict:
 4876                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4877                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4878                        "createdBy": "howard",
 4879                        "phenopacketSchemaVersion": 1,
 4880                    }
 4881
 4882                ### OutputOptions ###
 4883
 4884                # Init output result folder
 4885                output_results = os.path.join(tmp_dir, "results")
 4886
 4887                # If no outputOptions in analysis dict
 4888                if "outputOptions" not in param_exomiser_analysis_dict:
 4889
 4890                    # default output formats
 4891                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4892
 4893                    # Get outputOptions in param
 4894                    output_options = param_exomiser.get("outputOptions", None)
 4895
 4896                    # If no output_options in param -> use default options
 4897                    if not output_options:
 4898                        output_options = {
 4899                            "outputContributingVariantsOnly": False,
 4900                            "numGenes": 0,
 4901                            "outputFormats": defaut_output_formats,
 4902                        }
 4903
 4904                    # Replace outputDirectory in output options
 4905                    output_options["outputDirectory"] = output_results
 4906                    output_options["outputFileName"] = "howard"
 4907
 4908                    # Add outputOptions in analysis dict
 4909                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4910
 4911                else:
 4912
 4913                    # Replace output_results and output format (if exists in param)
 4914                    param_exomiser_analysis_dict["outputOptions"][
 4915                        "outputDirectory"
 4916                    ] = output_results
 4917                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4918                        list(
 4919                            set(
 4920                                param_exomiser_analysis_dict.get(
 4921                                    "outputOptions", {}
 4922                                ).get("outputFormats", [])
 4923                                + ["TSV_VARIANT", "VCF"]
 4924                            )
 4925                        )
 4926                    )
 4927
 4928                # log
 4929                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4930
 4931                ### ANALYSIS FILE ###
 4932                #####################
 4933
 4934                ### Full JSON analysis config file ###
 4935
 4936                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4937                with open(exomiser_analysis, "w") as fp:
 4938                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4939
 4940                ### SPLIT analysis and sample config files
 4941
 4942                # Splitted analysis dict
 4943                param_exomiser_analysis_dict_for_split = (
 4944                    param_exomiser_analysis_dict.copy()
 4945                )
 4946
 4947                # Phenopacket JSON file
 4948                exomiser_analysis_phenopacket = os.path.join(
 4949                    tmp_dir, "analysis_phenopacket.json"
 4950                )
 4951                with open(exomiser_analysis_phenopacket, "w") as fp:
 4952                    json.dump(
 4953                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4954                        fp,
 4955                        indent=4,
 4956                    )
 4957
 4958                # Analysis JSON file without Phenopacket parameters
 4959                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4960                exomiser_analysis_analysis = os.path.join(
 4961                    tmp_dir, "analysis_analysis.json"
 4962                )
 4963                with open(exomiser_analysis_analysis, "w") as fp:
 4964                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4965
 4966                ### INITIAL VCF file ###
 4967                #######################
 4968
 4969                ### Create list of samples to use and include into initial VCF file ####
 4970
 4971                # Subject (main sample)
 4972                # Get sample ID in analysis dict
 4973                sample_subject = (
 4974                    param_exomiser_analysis_dict.get("phenopacket", {})
 4975                    .get("subject", {})
 4976                    .get("id", None)
 4977                )
 4978                sample_proband = (
 4979                    param_exomiser_analysis_dict.get("phenopacket", {})
 4980                    .get("proband", {})
 4981                    .get("subject", {})
 4982                    .get("id", None)
 4983                )
 4984                sample = []
 4985                if sample_subject:
 4986                    sample.append(sample_subject)
 4987                if sample_proband:
 4988                    sample.append(sample_proband)
 4989
 4990                # Get sample ID within Pedigree
 4991                pedigree_persons_list = (
 4992                    param_exomiser_analysis_dict.get("phenopacket", {})
 4993                    .get("pedigree", {})
 4994                    .get("persons", {})
 4995                )
 4996
 4997                # Create list with all sample ID in pedigree (if exists)
 4998                pedigree_persons = []
 4999                for person in pedigree_persons_list:
 5000                    pedigree_persons.append(person.get("individualId"))
 5001
 5002                # Concat subject sample ID and samples ID in pedigree samples
 5003                samples = list(set(sample + pedigree_persons))
 5004
 5005                # Check if sample list is not empty
 5006                if not samples:
 5007                    log.error(f"No samples found")
 5008                    raise ValueError(f"No samples found")
 5009
 5010                # Create VCF with sample (either sample in param or first one by default)
 5011                # Export VCF file
 5012                self.export_variant_vcf(
 5013                    vcf_file=tmp_vcf_name,
 5014                    remove_info=True,
 5015                    add_samples=True,
 5016                    list_samples=samples,
 5017                    index=False,
 5018                )
 5019
 5020                ### Execute Exomiser ###
 5021                ########################
 5022
 5023                # Init command
 5024                exomiser_command = ""
 5025
 5026                # Command exomiser options
 5027                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5028
 5029                # Release
 5030                exomiser_release = param_exomiser.get("release", None)
 5031                if exomiser_release:
 5032                    # phenotype data version
 5033                    exomiser_options += (
 5034                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5035                    )
 5036                    # data version
 5037                    exomiser_options += (
 5038                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5039                    )
 5040                    # variant white list
 5041                    variant_white_list_file = (
 5042                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5043                    )
 5044                    if os.path.exists(
 5045                        os.path.join(
 5046                            databases_folders, assembly, variant_white_list_file
 5047                        )
 5048                    ):
 5049                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5050
 5051                # transcript_source
 5052                transcript_source = param_exomiser.get(
 5053                    "transcript_source", None
 5054                )  # ucsc, refseq, ensembl
 5055                if transcript_source:
 5056                    exomiser_options += (
 5057                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5058                    )
 5059
 5060                # If analysis contain proband param
 5061                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5062                    "proband", {}
 5063                ):
 5064                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5065
 5066                # If no proband (usually uniq sample)
 5067                else:
 5068                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5069
 5070                # Log
 5071                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5072
 5073                # Run command
 5074                result = subprocess.call(
 5075                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5076                )
 5077                if result:
 5078                    log.error("Exomiser command failed")
 5079                    raise ValueError("Exomiser command failed")
 5080
 5081                ### RESULTS ###
 5082                ###############
 5083
 5084                ### Annotate with TSV fields ###
 5085
 5086                # Init result tsv file
 5087                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5088
 5089                # Init result tsv file
 5090                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5091
 5092                # Parse TSV file and explode columns in INFO field
 5093                if exomiser_to_info and os.path.exists(output_results_tsv):
 5094
 5095                    # Log
 5096                    log.debug("Exomiser columns to VCF INFO field")
 5097
 5098                    # Retrieve columns and types
 5099                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5100                    output_results_tsv_df = self.get_query_to_df(query)
 5101                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5102
 5103                    # Init concat fields for update
 5104                    sql_query_update_concat_fields = []
 5105
 5106                    # Fields to avoid
 5107                    fields_to_avoid = [
 5108                        "CONTIG",
 5109                        "START",
 5110                        "END",
 5111                        "REF",
 5112                        "ALT",
 5113                        "QUAL",
 5114                        "FILTER",
 5115                        "GENOTYPE",
 5116                    ]
 5117
 5118                    # List all columns to add into header
 5119                    for header_column in output_results_tsv_columns:
 5120
 5121                        # If header column is enable
 5122                        if header_column not in fields_to_avoid:
 5123
 5124                            # Header info type
 5125                            header_info_type = "String"
 5126                            header_column_df = output_results_tsv_df[header_column]
 5127                            header_column_df_dtype = header_column_df.dtype
 5128                            if header_column_df_dtype == object:
 5129                                if (
 5130                                    pd.to_numeric(header_column_df, errors="coerce")
 5131                                    .notnull()
 5132                                    .all()
 5133                                ):
 5134                                    header_info_type = "Float"
 5135                            else:
 5136                                header_info_type = "Integer"
 5137
 5138                            # Header info
 5139                            characters_to_validate = ["-"]
 5140                            pattern = "[" + "".join(characters_to_validate) + "]"
 5141                            header_info_name = re.sub(
 5142                                pattern,
 5143                                "_",
 5144                                f"Exomiser_{header_column}".replace("#", ""),
 5145                            )
 5146                            header_info_number = "."
 5147                            header_info_description = (
 5148                                f"Exomiser {header_column} annotation"
 5149                            )
 5150                            header_info_source = "Exomiser"
 5151                            header_info_version = "unknown"
 5152                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5153                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5154                                header_info_name,
 5155                                header_info_number,
 5156                                header_info_type,
 5157                                header_info_description,
 5158                                header_info_source,
 5159                                header_info_version,
 5160                                header_info_code,
 5161                            )
 5162
 5163                            # Add field to add for update to concat fields
 5164                            sql_query_update_concat_fields.append(
 5165                                f"""
 5166                                CASE
 5167                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5168                                    THEN concat(
 5169                                        '{header_info_name}=',
 5170                                        table_parquet."{header_column}",
 5171                                        ';'
 5172                                        )
 5173
 5174                                    ELSE ''
 5175                                END
 5176                            """
 5177                            )
 5178
 5179                    # Update query
 5180                    sql_query_update = f"""
 5181                        UPDATE {table_variants} as table_variants
 5182                            SET INFO = concat(
 5183                                            CASE
 5184                                                WHEN INFO NOT IN ('', '.')
 5185                                                THEN INFO
 5186                                                ELSE ''
 5187                                            END,
 5188                                            CASE
 5189                                                WHEN table_variants.INFO NOT IN ('','.')
 5190                                                THEN ';'
 5191                                                ELSE ''
 5192                                            END,
 5193                                            (
 5194                                            SELECT 
 5195                                                concat(
 5196                                                    {",".join(sql_query_update_concat_fields)}
 5197                                                )
 5198                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5199                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5200                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5201                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5202                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5203                                            )
 5204                                        )
 5205                            ;
 5206                        """
 5207
 5208                    # Update
 5209                    self.conn.execute(sql_query_update)
 5210
 5211                ### Annotate with VCF INFO field ###
 5212
 5213                # Init result VCF file
 5214                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5215
 5216                # If VCF exists
 5217                if os.path.exists(output_results_vcf):
 5218
 5219                    # Log
 5220                    log.debug("Exomiser result VCF update variants")
 5221
 5222                    # Find Exomiser INFO field annotation in header
 5223                    with gzip.open(output_results_vcf, "rt") as f:
 5224                        header_list = self.read_vcf_header(f)
 5225                    exomiser_vcf_header = vcf.Reader(
 5226                        io.StringIO("\n".join(header_list))
 5227                    )
 5228
 5229                    # Add annotation INFO field to header
 5230                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5231
 5232                    # Update variants with VCF
 5233                    self.update_from_vcf(output_results_vcf)
 5234
 5235        return True
 5236
 5237    def annotation_snpeff(self, threads: int = None) -> None:
 5238        """
 5239        This function annotate with snpEff
 5240
 5241        :param threads: The number of threads to use
 5242        :return: the value of the variable "return_value".
 5243        """
 5244
 5245        # DEBUG
 5246        log.debug("Start annotation with snpeff databases")
 5247
 5248        # Threads
 5249        if not threads:
 5250            threads = self.get_threads()
 5251        log.debug("Threads: " + str(threads))
 5252
 5253        # DEBUG
 5254        delete_tmp = True
 5255        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5256            delete_tmp = False
 5257            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5258
 5259        # Config
 5260        config = self.get_config()
 5261        log.debug("Config: " + str(config))
 5262
 5263        # Config - Folders - Databases
 5264        databases_folders = (
 5265            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5266        )
 5267        log.debug("Databases annotations: " + str(databases_folders))
 5268
 5269        # Config - snpEff bin command
 5270        snpeff_bin_command = get_bin_command(
 5271            bin="snpEff.jar",
 5272            tool="snpeff",
 5273            bin_type="jar",
 5274            config=config,
 5275            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5276        )
 5277        if not snpeff_bin_command:
 5278            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5279            log.error(msg_err)
 5280            raise ValueError(msg_err)
 5281
 5282        # Config - snpEff databases
 5283        snpeff_databases = (
 5284            config.get("folders", {})
 5285            .get("databases", {})
 5286            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5287        )
 5288        snpeff_databases = full_path(snpeff_databases)
 5289        if snpeff_databases is not None and snpeff_databases != "":
 5290            log.debug(f"Create snpEff databases folder")
 5291            if not os.path.exists(snpeff_databases):
 5292                os.makedirs(snpeff_databases)
 5293
 5294        # Param
 5295        param = self.get_param()
 5296        log.debug("Param: " + str(param))
 5297
 5298        # Param
 5299        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5300        log.debug("Options: " + str(options))
 5301
 5302        # Param - Assembly
 5303        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5304
 5305        # Param - Options
 5306        snpeff_options = (
 5307            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5308        )
 5309        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5310        snpeff_csvstats = (
 5311            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5312        )
 5313        if snpeff_stats:
 5314            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5315            snpeff_stats = full_path(snpeff_stats)
 5316            snpeff_options += f" -stats {snpeff_stats}"
 5317        if snpeff_csvstats:
 5318            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5319            snpeff_csvstats = full_path(snpeff_csvstats)
 5320            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5321
 5322        # Data
 5323        table_variants = self.get_table_variants()
 5324
 5325        # Check if not empty
 5326        log.debug("Check if not empty")
 5327        sql_query_chromosomes = (
 5328            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5329        )
 5330        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5331        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5332            log.info(f"VCF empty")
 5333            return
 5334
 5335        # Export in VCF
 5336        log.debug("Create initial file to annotate")
 5337        tmp_vcf = NamedTemporaryFile(
 5338            prefix=self.get_prefix(),
 5339            dir=self.get_tmp_dir(),
 5340            suffix=".vcf.gz",
 5341            delete=True,
 5342        )
 5343        tmp_vcf_name = tmp_vcf.name
 5344
 5345        # VCF header
 5346        vcf_reader = self.get_header()
 5347        log.debug("Initial header: " + str(vcf_reader.infos))
 5348
 5349        # Existing annotations
 5350        for vcf_annotation in self.get_header().infos:
 5351
 5352            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5353            log.debug(
 5354                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5355            )
 5356
 5357        # Memory limit
 5358        # if config.get("memory", None):
 5359        #     memory_limit = config.get("memory", "8G")
 5360        # else:
 5361        #     memory_limit = "8G"
 5362        memory_limit = self.get_memory("8G")
 5363        log.debug(f"memory_limit: {memory_limit}")
 5364
 5365        # snpEff java options
 5366        snpeff_java_options = (
 5367            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5368        )
 5369        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5370
 5371        force_update_annotation = True
 5372
 5373        if "ANN" not in self.get_header().infos or force_update_annotation:
 5374
 5375            # Check snpEff database
 5376            log.debug(f"Check snpEff databases {[assembly]}")
 5377            databases_download_snpeff(
 5378                folder=snpeff_databases, assemblies=[assembly], config=config
 5379            )
 5380
 5381            # Export VCF file
 5382            self.export_variant_vcf(
 5383                vcf_file=tmp_vcf_name,
 5384                remove_info=True,
 5385                add_samples=False,
 5386                index=True,
 5387            )
 5388
 5389            # Tmp file
 5390            err_files = []
 5391            tmp_annotate_vcf = NamedTemporaryFile(
 5392                prefix=self.get_prefix(),
 5393                dir=self.get_tmp_dir(),
 5394                suffix=".vcf",
 5395                delete=False,
 5396            )
 5397            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5398            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5399            err_files.append(tmp_annotate_vcf_name_err)
 5400
 5401            # Command
 5402            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5403            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5404            run_parallel_commands([snpeff_command], 1)
 5405
 5406            # Error messages
 5407            log.info(f"Error/Warning messages:")
 5408            error_message_command_all = []
 5409            error_message_command_warning = []
 5410            error_message_command_err = []
 5411            for err_file in err_files:
 5412                with open(err_file, "r") as f:
 5413                    for line in f:
 5414                        message = line.strip()
 5415                        error_message_command_all.append(message)
 5416                        if line.startswith("[W::"):
 5417                            error_message_command_warning.append(message)
 5418                        if line.startswith("[E::"):
 5419                            error_message_command_err.append(f"{err_file}: " + message)
 5420            # log info
 5421            for message in list(
 5422                set(error_message_command_err + error_message_command_warning)
 5423            ):
 5424                log.info(f"   {message}")
 5425            # debug info
 5426            for message in list(set(error_message_command_all)):
 5427                log.debug(f"   {message}")
 5428            # failed
 5429            if len(error_message_command_err):
 5430                log.error("Annotation failed: Error in commands")
 5431                raise ValueError("Annotation failed: Error in commands")
 5432
 5433            # Find annotation in header
 5434            with open(tmp_annotate_vcf_name, "rt") as f:
 5435                header_list = self.read_vcf_header(f)
 5436            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5437
 5438            for ann in annovar_vcf_header.infos:
 5439                if ann not in self.get_header().infos:
 5440                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5441
 5442            # Update variants
 5443            log.info(f"Annotation - Updating...")
 5444            self.update_from_vcf(tmp_annotate_vcf_name)
 5445
 5446        else:
 5447            if "ANN" in self.get_header().infos:
 5448                log.debug(f"Existing snpEff annotations in VCF")
 5449            if force_update_annotation:
 5450                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5451
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar databases.

        Pipeline (all steps via external shell commands):
          1. Export the current variants to a temporary bgzipped VCF.
          2. For each configured Annovar database, run ``table_annovar.pl``,
             then post-process the annotated VCF through a bcftools/sed/awk
             pipe that strips Annovar artefacts (ANNOVAR_DATE/ALLELE_END),
             decodes escaped characters, drops empty "." INFO values,
             keeps/renames only the requested INFO fields, and bgzip+tabix
             indexes the result.
          3. Merge all per-database annotated VCFs with ``bcftools merge``.
          4. Merge new INFO definitions into the in-memory header and update
             the variants table from the merged VCF.
          5. Remove the temporary files.

        :param threads: number of threads to use; defaults to self.get_threads()
        :raises ValueError: if the annovar/bcftools binaries are not found, if
            the Annovar databases folder cannot be configured, or if any
            executed command writes an ERROR/[E:: line to its stderr file
        :return: None (the variants table and header are updated in place)
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Temporary and stderr files collected for final cleanup
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed here but the cleanup section at
        # the end of this method runs unconditionally ("if True:"), so tmp
        # files are removed even in debug mode — confirm whether intended.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (table_annovar.pl wrapper resolved by helper)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (a list config takes its first entry;
        # the folder is created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl options, e.g. "genebase")
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping of Annovar database name -> fields to keep
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly (param overrides config, falls back to default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly sub-folder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate in an empty VCF
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header (mutated below with any new INFO definitions)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug-log each INFO field already in the header)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Hard-coded: always re-annotate, even if the field already exists
        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF — the single input file shared by all Annovar runs
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            # NOTE(review): remove_info="." here (a string) whereas the snpEff
            # path passes remove_info=True — presumably "." replaces INFO with
            # a dot placeholder; confirm against export_variant_vcf.
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (bcftools --rename-annots input:
            # one "INFO/<old> <new>" line per field, appended in the loop below)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database — download missing databases beforehand
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run (and post-processing pipe) per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # No explicit fields means: keep the full INFO of the database
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is re-initialized each iteration, so
                # the merge step below only sees the last database's stderr
                # file — confirm whether earlier err files should accumulate.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl writes <prefix>.<assembly>_multianno.vcf
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields requested for this database (original and renamed names)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (appended via shell echo into the rename file)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol (Annovar --protocol value is the database name itself)
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ("genebase" is consumed above as --argument)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar (stderr appended to the err file; the
                # multianno output is moved to a predictable .tmp.vcf path)
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                # ("^INFO/x" entries tell bcftools -x to keep x and drop the rest)
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages — scan the stderr files; warnings are logged,
                # any [E::/ERROR line aborts the whole annotation
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged output
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge — combine the original export with every
                # per-database annotated VCF into a single file
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged file and add any new
                # INFO definitions to the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command (single rm -f over all collected paths;
            # see the delete_tmp note near the top of this method)
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)
 5842    # Parquet
 5843    def annotation_parquet(self, threads: int = None) -> None:
 5844        """
 5845        It takes a VCF file, and annotates it with a parquet file
 5846
 5847        :param threads: number of threads to use for the annotation
 5848        :return: the value of the variable "result".
 5849        """
 5850
 5851        # DEBUG
 5852        log.debug("Start annotation with parquet databases")
 5853
 5854        # Threads
 5855        if not threads:
 5856            threads = self.get_threads()
 5857        log.debug("Threads: " + str(threads))
 5858
 5859        # DEBUG
 5860        delete_tmp = True
 5861        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5862            delete_tmp = False
 5863            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5864
 5865        # Config
 5866        databases_folders = set(
 5867            self.get_config()
 5868            .get("folders", {})
 5869            .get("databases", {})
 5870            .get("annotations", ["."])
 5871            + self.get_config()
 5872            .get("folders", {})
 5873            .get("databases", {})
 5874            .get("parquet", ["."])
 5875        )
 5876        log.debug("Databases annotations: " + str(databases_folders))
 5877
 5878        # Param
 5879        annotations = (
 5880            self.get_param()
 5881            .get("annotation", {})
 5882            .get("parquet", {})
 5883            .get("annotations", None)
 5884        )
 5885        log.debug("Annotations: " + str(annotations))
 5886
 5887        # Assembly
 5888        assembly = self.get_param().get(
 5889            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5890        )
 5891
 5892        # Force Update Annotation
 5893        force_update_annotation = (
 5894            self.get_param()
 5895            .get("annotation", {})
 5896            .get("options", {})
 5897            .get("annotations_update", False)
 5898        )
 5899        log.debug(f"force_update_annotation={force_update_annotation}")
 5900        force_append_annotation = (
 5901            self.get_param()
 5902            .get("annotation", {})
 5903            .get("options", {})
 5904            .get("annotations_append", False)
 5905        )
 5906        log.debug(f"force_append_annotation={force_append_annotation}")
 5907
 5908        # Data
 5909        table_variants = self.get_table_variants()
 5910
 5911        # Check if not empty
 5912        log.debug("Check if not empty")
 5913        sql_query_chromosomes_df = self.get_query_to_df(
 5914            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5915        )
 5916        if not sql_query_chromosomes_df["count"][0]:
 5917            log.info(f"VCF empty")
 5918            return
 5919
 5920        # VCF header
 5921        vcf_reader = self.get_header()
 5922        log.debug("Initial header: " + str(vcf_reader.infos))
 5923
 5924        # Nb Variants POS
 5925        log.debug("NB Variants Start")
 5926        nb_variants = self.conn.execute(
 5927            f"SELECT count(*) AS count FROM variants"
 5928        ).fetchdf()["count"][0]
 5929        log.debug("NB Variants Stop")
 5930
 5931        # Existing annotations
 5932        for vcf_annotation in self.get_header().infos:
 5933
 5934            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5935            log.debug(
 5936                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5937            )
 5938
 5939        # Added columns
 5940        added_columns = []
 5941
 5942        # drop indexes
 5943        log.debug(f"Drop indexes...")
 5944        self.drop_indexes()
 5945
 5946        if annotations:
 5947
 5948            if "ALL" in annotations:
 5949
 5950                all_param = annotations.get("ALL", {})
 5951                all_param_formats = all_param.get("formats", None)
 5952                all_param_releases = all_param.get("releases", None)
 5953
 5954                databases_infos_dict = self.scan_databases(
 5955                    database_formats=all_param_formats,
 5956                    database_releases=all_param_releases,
 5957                )
 5958                for database_infos in databases_infos_dict.keys():
 5959                    if database_infos not in annotations:
 5960                        annotations[database_infos] = {"INFO": None}
 5961
 5962            for annotation in annotations:
 5963
 5964                if annotation in ["ALL"]:
 5965                    continue
 5966
 5967                # Annotation Name
 5968                annotation_name = os.path.basename(annotation)
 5969
 5970                # Annotation fields
 5971                annotation_fields = annotations[annotation]
 5972                if not annotation_fields:
 5973                    annotation_fields = {"INFO": None}
 5974
 5975                log.debug(f"Annotation '{annotation_name}'")
 5976                log.debug(
 5977                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5978                )
 5979
 5980                # Create Database
 5981                database = Database(
 5982                    database=annotation,
 5983                    databases_folders=databases_folders,
 5984                    assembly=assembly,
 5985                )
 5986
 5987                # Find files
 5988                parquet_file = database.get_database()
 5989                parquet_hdr_file = database.get_header_file()
 5990                parquet_type = database.get_type()
 5991
 5992                # Check if files exists
 5993                if not parquet_file or not parquet_hdr_file:
 5994                    msg_err_list = []
 5995                    if not parquet_file:
 5996                        msg_err_list.append(
 5997                            f"Annotation failed: Annotation file not found"
 5998                        )
 5999                    if parquet_file and not parquet_hdr_file:
 6000                        msg_err_list.append(
 6001                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 6002                        )
 6003
 6004                    log.error(". ".join(msg_err_list))
 6005                    raise ValueError(". ".join(msg_err_list))
 6006                else:
 6007                    # Get parquet connexion
 6008                    parquet_sql_attach = database.get_sql_database_attach(
 6009                        output="query"
 6010                    )
 6011                    if parquet_sql_attach:
 6012                        self.conn.execute(parquet_sql_attach)
 6013                    parquet_file_link = database.get_sql_database_link()
 6014                    # Log
 6015                    log.debug(
 6016                        f"Annotation '{annotation_name}' - file: "
 6017                        + str(parquet_file)
 6018                        + " and "
 6019                        + str(parquet_hdr_file)
 6020                    )
 6021
 6022                    # Database full header columns
 6023                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 6024                        parquet_hdr_file
 6025                    )
 6026                    # Log
 6027                    log.debug(
 6028                        "Annotation database header columns : "
 6029                        + str(parquet_hdr_vcf_header_columns)
 6030                    )
 6031
 6032                    # Load header as VCF object
 6033                    parquet_hdr_vcf_header_infos = database.get_header().infos
 6034                    # Log
 6035                    log.debug(
 6036                        "Annotation database header: "
 6037                        + str(parquet_hdr_vcf_header_infos)
 6038                    )
 6039
 6040                    # Get extra infos
 6041                    parquet_columns = database.get_extra_columns()
 6042                    # Log
 6043                    log.debug("Annotation database Columns: " + str(parquet_columns))
 6044
 6045                    # Add extra columns if "ALL" in annotation_fields
 6046                    # if "ALL" in annotation_fields:
 6047                    #     allow_add_extra_column = True
 6048                    if "ALL" in annotation_fields and database.get_extra_columns():
 6049                        for extra_column in database.get_extra_columns():
 6050                            if (
 6051                                extra_column not in annotation_fields
 6052                                and extra_column.replace("INFO/", "")
 6053                                not in parquet_hdr_vcf_header_infos
 6054                            ):
 6055                                parquet_hdr_vcf_header_infos[extra_column] = (
 6056                                    vcf.parser._Info(
 6057                                        extra_column,
 6058                                        ".",
 6059                                        "String",
 6060                                        f"{extra_column} description",
 6061                                        "unknown",
 6062                                        "unknown",
 6063                                        self.code_type_map["String"],
 6064                                    )
 6065                                )
 6066
 6067                    # For all fields in database
 6068                    annotation_fields_all = False
 6069                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6070                        annotation_fields_all = True
 6071                        annotation_fields = {
 6072                            key: key for key in parquet_hdr_vcf_header_infos
 6073                        }
 6074
 6075                        log.debug(
 6076                            "Annotation database header - All annotations added: "
 6077                            + str(annotation_fields)
 6078                        )
 6079
 6080                    # Init
 6081
 6082                    # List of annotation fields to use
 6083                    sql_query_annotation_update_info_sets = []
 6084
                    # List of annotations to aggregate
 6086                    sql_query_annotation_to_agregate = []
 6087
 6088                    # Number of fields
 6089                    nb_annotation_field = 0
 6090
 6091                    # Annotation fields processed
 6092                    annotation_fields_processed = []
 6093
 6094                    # Columns mapping
 6095                    map_columns = database.map_columns(
 6096                        columns=annotation_fields, prefixes=["INFO/"]
 6097                    )
 6098
 6099                    # Query dict for fields to remove (update option)
 6100                    query_dict_remove = {}
 6101
                    # Fetch annotation fields
 6103                    for annotation_field in annotation_fields:
 6104
 6105                        # annotation_field_column
 6106                        annotation_field_column = map_columns.get(
 6107                            annotation_field, "INFO"
 6108                        )
 6109
 6110                        # field new name, if parametered
 6111                        annotation_fields_new_name = annotation_fields.get(
 6112                            annotation_field, annotation_field
 6113                        )
 6114                        if not annotation_fields_new_name:
 6115                            annotation_fields_new_name = annotation_field
 6116
 6117                        # To annotate
 6118                        # force_update_annotation = True
 6119                        # force_append_annotation = True
 6120                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6121                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6122                            force_update_annotation
 6123                            or force_append_annotation
 6124                            or (
 6125                                annotation_fields_new_name
 6126                                not in self.get_header().infos
 6127                            )
 6128                        ):
 6129
 6130                            # Add field to annotation to process list
 6131                            annotation_fields_processed.append(
 6132                                annotation_fields_new_name
 6133                            )
 6134
 6135                            # explode infos for the field
 6136                            annotation_fields_new_name_info_msg = ""
 6137                            if (
 6138                                force_update_annotation
 6139                                and annotation_fields_new_name
 6140                                in self.get_header().infos
 6141                            ):
 6142                                # Remove field from INFO
 6143                                query = f"""
 6144                                    UPDATE {table_variants} as table_variants
 6145                                    SET INFO = REGEXP_REPLACE(
 6146                                                concat(table_variants.INFO,''),
 6147                                                ';*{annotation_fields_new_name}=[^;]*',
 6148                                                ''
 6149                                                )
 6150                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6151                                """
 6152                                annotation_fields_new_name_info_msg = " [update]"
 6153                                query_dict_remove[
 6154                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6155                                ] = query
 6156
 6157                            # Sep between fields in INFO
 6158                            nb_annotation_field += 1
 6159                            if nb_annotation_field > 1:
 6160                                annotation_field_sep = ";"
 6161                            else:
 6162                                annotation_field_sep = ""
 6163
 6164                            log.info(
 6165                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6166                            )
 6167
 6168                            # Add INFO field to header
 6169                            parquet_hdr_vcf_header_infos_number = (
 6170                                parquet_hdr_vcf_header_infos[annotation_field].num
 6171                                or "."
 6172                            )
 6173                            parquet_hdr_vcf_header_infos_type = (
 6174                                parquet_hdr_vcf_header_infos[annotation_field].type
 6175                                or "String"
 6176                            )
 6177                            parquet_hdr_vcf_header_infos_description = (
 6178                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6179                                or f"{annotation_field} description"
 6180                            )
 6181                            parquet_hdr_vcf_header_infos_source = (
 6182                                parquet_hdr_vcf_header_infos[annotation_field].source
 6183                                or "unknown"
 6184                            )
 6185                            parquet_hdr_vcf_header_infos_version = (
 6186                                parquet_hdr_vcf_header_infos[annotation_field].version
 6187                                or "unknown"
 6188                            )
 6189
 6190                            vcf_reader.infos[annotation_fields_new_name] = (
 6191                                vcf.parser._Info(
 6192                                    annotation_fields_new_name,
 6193                                    parquet_hdr_vcf_header_infos_number,
 6194                                    parquet_hdr_vcf_header_infos_type,
 6195                                    parquet_hdr_vcf_header_infos_description,
 6196                                    parquet_hdr_vcf_header_infos_source,
 6197                                    parquet_hdr_vcf_header_infos_version,
 6198                                    self.code_type_map[
 6199                                        parquet_hdr_vcf_header_infos_type
 6200                                    ],
 6201                                )
 6202                            )
 6203
 6204                            # Append
 6205                            if force_append_annotation:
 6206                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6207                            else:
 6208                                query_case_when_append = ""
 6209
 6210                            # Annotation/Update query fields
 6211                            # Found in INFO column
 6212                            if (
 6213                                annotation_field_column == "INFO"
 6214                                and "INFO" in parquet_hdr_vcf_header_columns
 6215                            ):
 6216                                sql_query_annotation_update_info_sets.append(
 6217                                    f"""
 6218                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6219                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6220                                        ELSE ''
 6221                                    END
 6222                                """
 6223                                )
 6224                            # Found in a specific column
 6225                            else:
 6226                                sql_query_annotation_update_info_sets.append(
 6227                                    f"""
 6228                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6229                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6230                                        ELSE ''
 6231                                    END
 6232                                """
 6233                                )
 6234                                sql_query_annotation_to_agregate.append(
 6235                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6236                                )
 6237
 6238                        # Not to annotate
 6239                        else:
 6240
 6241                            if force_update_annotation:
 6242                                annotation_message = "forced"
 6243                            else:
 6244                                annotation_message = "skipped"
 6245
 6246                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6247                                log.warning(
 6248                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6249                                )
 6250                            if annotation_fields_new_name in self.get_header().infos:
 6251                                log.warning(
 6252                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6253                                )
 6254
 6255                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6256                    # allow_annotation_full_info = True
 6257                    allow_annotation_full_info = not force_append_annotation
 6258
 6259                    if parquet_type in ["regions"]:
 6260                        allow_annotation_full_info = False
 6261
 6262                    if (
 6263                        allow_annotation_full_info
 6264                        and nb_annotation_field == len(annotation_fields)
 6265                        and annotation_fields_all
 6266                        and (
 6267                            "INFO" in parquet_hdr_vcf_header_columns
 6268                            and "INFO" in database.get_extra_columns()
 6269                        )
 6270                    ):
 6271                        log.debug("Column INFO annotation enabled")
 6272                        sql_query_annotation_update_info_sets = []
 6273                        sql_query_annotation_update_info_sets.append(
 6274                            f" table_parquet.INFO "
 6275                        )
 6276
 6277                    if sql_query_annotation_update_info_sets:
 6278
 6279                        # Annotate
 6280                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6281
 6282                        # Join query annotation update info sets for SQL
 6283                        sql_query_annotation_update_info_sets_sql = ",".join(
 6284                            sql_query_annotation_update_info_sets
 6285                        )
 6286
 6287                        # Check chromosomes list (and variants infos)
 6288                        sql_query_chromosomes = f"""
 6289                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6290                            FROM {table_variants} as table_variants
 6291                            GROUP BY table_variants."#CHROM"
 6292                            ORDER BY table_variants."#CHROM"
 6293                            """
 6294                        sql_query_chromosomes_df = self.conn.execute(
 6295                            sql_query_chromosomes
 6296                        ).df()
 6297                        sql_query_chromosomes_dict = {
 6298                            entry["CHROM"]: {
 6299                                "count": entry["count_variants"],
 6300                                "min": entry["min_variants"],
 6301                                "max": entry["max_variants"],
 6302                            }
 6303                            for index, entry in sql_query_chromosomes_df.iterrows()
 6304                        }
 6305
 6306                        # Init
 6307                        nb_of_query = 0
 6308                        nb_of_variant_annotated = 0
 6309                        query_dict = query_dict_remove
 6310
 6311                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6312                        for chrom in sql_query_chromosomes_dict:
 6313
 6314                            # Number of variant by chromosome
 6315                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6316                                chrom, {}
 6317                            ).get("count", 0)
 6318
 6319                            log.debug(
 6320                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6321                            )
 6322
 6323                            # Annotation with regions database
 6324                            if parquet_type in ["regions"]:
 6325                                sql_query_annotation_from_clause = f"""
 6326                                    FROM (
 6327                                        SELECT 
 6328                                            '{chrom}' AS \"#CHROM\",
 6329                                            table_variants_from.\"POS\" AS \"POS\",
 6330                                            {",".join(sql_query_annotation_to_agregate)}
 6331                                        FROM {table_variants} as table_variants_from
 6332                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6333                                            table_parquet_from."#CHROM" = '{chrom}'
 6334                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6335                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6336                                        )
 6337                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6338                                        GROUP BY table_variants_from.\"POS\"
 6339                                        )
 6340                                        as table_parquet
 6341                                """
 6342
 6343                                sql_query_annotation_where_clause = """
 6344                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6345                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6346                                """
 6347
 6348                            # Annotation with variants database
 6349                            else:
 6350                                sql_query_annotation_from_clause = f"""
 6351                                    FROM {parquet_file_link} as table_parquet
 6352                                """
 6353                                sql_query_annotation_where_clause = f"""
 6354                                    table_variants."#CHROM" = '{chrom}'
 6355                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6356                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6357                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6358                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6359                                """
 6360
 6361                            # Create update query
 6362                            sql_query_annotation_chrom_interval_pos = f"""
 6363                                UPDATE {table_variants} as table_variants
 6364                                    SET INFO = 
 6365                                        concat(
 6366                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6367                                                THEN table_variants.INFO
 6368                                                ELSE ''
 6369                                            END
 6370                                            ,
 6371                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6372                                                        AND (
 6373                                                        concat({sql_query_annotation_update_info_sets_sql})
 6374                                                        )
 6375                                                        NOT IN ('','.') 
 6376                                                    THEN ';'
 6377                                                    ELSE ''
 6378                                            END
 6379                                            ,
 6380                                            {sql_query_annotation_update_info_sets_sql}
 6381                                            )
 6382                                    {sql_query_annotation_from_clause}
 6383                                    WHERE {sql_query_annotation_where_clause}
 6384                                    ;
 6385                                """
 6386
 6387                            # Add update query to dict
 6388                            query_dict[
 6389                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6390                            ] = sql_query_annotation_chrom_interval_pos
 6391
 6392                        nb_of_query = len(query_dict)
 6393                        num_query = 0
 6394
 6395                        # SET max_expression_depth TO x
 6396                        self.conn.execute("SET max_expression_depth TO 10000")
 6397
 6398                        for query_name in query_dict:
 6399                            query = query_dict[query_name]
 6400                            num_query += 1
 6401                            log.info(
 6402                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6403                            )
 6404                            result = self.conn.execute(query)
 6405                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6406                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6407                            log.info(
 6408                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6409                            )
 6410
 6411                        log.info(
 6412                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6413                        )
 6414
 6415                    else:
 6416
 6417                        log.info(
 6418                            f"Annotation '{annotation_name}' - No Annotations available"
 6419                        )
 6420
 6421                    log.debug("Final header: " + str(vcf_reader.infos))
 6422
 6423        # Remove added columns
 6424        for added_column in added_columns:
 6425            self.drop_column(column=added_column)
 6426
 6427    def annotation_splice(self, threads: int = None) -> None:
 6428        """
 6429        This function annotate with snpEff
 6430
 6431        :param threads: The number of threads to use
 6432        :return: the value of the variable "return_value".
 6433        """
 6434
 6435        # DEBUG
 6436        log.debug("Start annotation with splice tools")
 6437
 6438        # Threads
 6439        if not threads:
 6440            threads = self.get_threads()
 6441        log.debug("Threads: " + str(threads))
 6442
 6443        # DEBUG
 6444        delete_tmp = True
 6445        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6446            delete_tmp = False
 6447            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6448
 6449        # Config
 6450        config = self.get_config()
 6451        log.debug("Config: " + str(config))
 6452        splice_config = config.get("tools", {}).get("splice", {})
 6453        if not splice_config:
 6454            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6455            msg_err = "No Splice tool config"
 6456            raise ValueError(msg_err)
 6457        log.debug(f"splice_config: {splice_config}")
 6458
 6459        # Config - Folders - Databases
 6460        databases_folders = (
 6461            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6462        )
 6463        log.debug("Databases annotations: " + str(databases_folders))
 6464
 6465        # Splice docker image
 6466        splice_docker_image = splice_config.get("docker").get("image")
 6467
 6468        # Pull splice image if it's not already there
 6469        if not check_docker_image_exists(splice_docker_image):
 6470            log.warning(
 6471                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6472            )
 6473            try:
 6474                command(f"docker pull {splice_config.get('docker').get('image')}")
 6475            except subprocess.CalledProcessError:
 6476                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6477                log.error(msg_err)
 6478                raise ValueError(msg_err)
 6479
 6480        # Config - splice databases
 6481        splice_databases = (
 6482            config.get("folders", {})
 6483            .get("databases", {})
 6484            .get("splice", DEFAULT_SPLICE_FOLDER)
 6485        )
 6486        splice_databases = full_path(splice_databases)
 6487
 6488        # Param
 6489        param = self.get_param()
 6490        log.debug("Param: " + str(param))
 6491
 6492        # Param
 6493        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6494        log.debug("Options: " + str(options))
 6495
 6496        # Data
 6497        table_variants = self.get_table_variants()
 6498
 6499        # Check if not empty
 6500        log.debug("Check if not empty")
 6501        sql_query_chromosomes = (
 6502            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6503        )
 6504        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6505            log.info("VCF empty")
 6506            return None
 6507
 6508        # Export in VCF
 6509        log.debug("Create initial file to annotate")
 6510
 6511        # Create output folder / work folder
 6512        if options.get("output_folder", ""):
 6513            output_folder = options.get("output_folder", "")
 6514            if not os.path.exists(output_folder):
 6515                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6516        else:
 6517            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6518            if not os.path.exists(output_folder):
 6519                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6520
 6521        if options.get("workdir", ""):
 6522            workdir = options.get("workdir", "")
 6523        else:
 6524            workdir = "/work"
 6525
 6526        # Create tmp VCF file
 6527        tmp_vcf = NamedTemporaryFile(
 6528            prefix=self.get_prefix(),
 6529            dir=output_folder,
 6530            suffix=".vcf",
 6531            delete=False,
 6532        )
 6533        tmp_vcf_name = tmp_vcf.name
 6534
 6535        # VCF header
 6536        header = self.get_header()
 6537
 6538        # Existing annotations
 6539        for vcf_annotation in self.get_header().infos:
 6540
 6541            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6542            log.debug(
 6543                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6544            )
 6545
 6546        # Memory limit
 6547        if config.get("memory", None):
 6548            memory_limit = config.get("memory", "8G").upper()
 6549            # upper()
 6550        else:
 6551            memory_limit = "8G"
 6552        log.debug(f"memory_limit: {memory_limit}")
 6553
 6554        # Check number of variants to annotate
 6555        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6556        where_clause_regex_spip = r"SPiP_\w+"
 6557        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6558        df_list_of_variants_to_annotate = self.get_query_to_df(
 6559            query=f""" SELECT * FROM variants {where_clause} """
 6560        )
 6561        if len(df_list_of_variants_to_annotate) == 0:
 6562            log.warning(
 6563                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6564            )
 6565            return None
 6566        else:
 6567            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6568
 6569        # Export VCF file
 6570        self.export_variant_vcf(
 6571            vcf_file=tmp_vcf_name,
 6572            remove_info=True,
 6573            add_samples=True,
 6574            index=False,
 6575            where_clause=where_clause,
 6576        )
 6577        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6578        if any(value for value in splice_config.values() if value is None):
 6579            log.warning("At least one splice config parameter is empty")
 6580            # exit annotation_splice
 6581            return None
 6582
 6583        # Params in splice nf
 6584        def check_values(dico: dict):
 6585            """
 6586            Ensure parameters for NF splice pipeline
 6587            """
 6588            for key, val in dico.items():
 6589                if key == "genome":
 6590                    if any(
 6591                        assemb in options.get("genome", {})
 6592                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6593                    ):
 6594                        yield f"--{key} hg19"
 6595                    elif any(
 6596                        assemb in options.get("genome", {})
 6597                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6598                    ):
 6599                        yield f"--{key} hg38"
 6600                elif (
 6601                    (isinstance(val, str) and val)
 6602                    or isinstance(val, int)
 6603                    or isinstance(val, bool)
 6604                ):
 6605                    yield f"--{key} {val}"
 6606
 6607        # Genome
 6608        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6609        options["genome"] = genome
 6610        # NF params
 6611        nf_params = []
 6612        # Add options
 6613        if options:
 6614            log.debug(options)
 6615            nf_params = list(check_values(options))
 6616            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6617        else:
 6618            log.debug("No NF params provided")
 6619        # Add threads
 6620        if "threads" not in options.keys():
 6621            nf_params.append(f"--threads {threads}")
 6622        # Genome path
 6623        genome_path = find_genome(
 6624            config.get("folders", {})
 6625            .get("databases", {})
 6626            .get("genomes", DEFAULT_GENOME_FOLDER),
 6627            file=f"{genome}.fa",
 6628        )
 6629        # Add genome path
 6630        if not genome_path:
 6631            raise ValueError(
 6632                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6633            )
 6634        else:
 6635            log.debug(f"Genome: {genome_path}")
 6636            nf_params.append(f"--genome_path {genome_path}")
 6637
 6638        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6639            """
 6640            Setting up updated databases for SPiP and SpliceAI
 6641            """
 6642
 6643            try:
 6644
 6645                # SpliceAI assembly transcriptome
 6646                spliceai_assembly = os.path.join(
 6647                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6648                    options.get("genome"),
 6649                    "transcriptome",
 6650                )
 6651                spip_assembly = options.get("genome")
 6652
 6653                spip = find(
 6654                    f"transcriptome_{spip_assembly}.RData",
 6655                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6656                )
 6657                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6658                log.debug(f"SPiP annotations: {spip}")
 6659                log.debug(f"SpliceAI annotations: {spliceai}")
 6660                if spip and spliceai:
 6661                    return [
 6662                        f"--spip_transcriptome {spip}",
 6663                        f"--spliceai_transcriptome {spliceai}",
 6664                    ]
 6665                else:
 6666                    log.warning(
 6667                        "Can't find splice databases in configuration, use annotations file from image"
 6668                    )
 6669            except TypeError:
 6670                log.warning(
 6671                    "Can't find splice databases in configuration, use annotations file from image"
 6672                )
 6673                return []
 6674
 6675        # Add options, check if transcriptome option have already beend provided
 6676        if (
 6677            "spip_transcriptome" not in nf_params
 6678            and "spliceai_transcriptome" not in nf_params
 6679        ):
 6680            splice_reference = splice_annotations(options, config)
 6681            if splice_reference:
 6682                nf_params.extend(splice_reference)
 6683        # nf_params.append(f"--output_folder {output_folder}")
 6684        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6685        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6686        log.debug(cmd)
 6687        splice_config["docker"]["command"] = cmd
 6688
 6689        # Ensure proxy is set
 6690        proxy = [
 6691            f"-e {var}={os.getenv(var)}"
 6692            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6693            if os.getenv(var) is not None
 6694        ]
 6695        docker_cmd = get_bin_command(
 6696            tool="splice",
 6697            bin_type="docker",
 6698            config=config,
 6699            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6700            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6701        )
 6702        # print(docker_cmd)
 6703        # exit()
 6704        # Docker debug
 6705        # if splice_config.get("rm_container"):
 6706        #     rm_container = "--rm"
 6707        # else:
 6708        #     rm_container = ""
 6709        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6710        log.debug(docker_cmd)
 6711        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6712        log.debug(res.stdout)
 6713        if res.stderr:
 6714            log.error(res.stderr)
 6715        res.check_returncode()
 6716        # Update variants
 6717        log.info("Annotation - Updating...")
 6718        # Test find output vcf
 6719        log.debug(
 6720            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6721        )
 6722        output_vcf = []
 6723        # Wrong folder to look in
 6724        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6725            if (
 6726                files
 6727                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6728            ):
 6729                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6730        # log.debug(os.listdir(options.get("output_folder")))
 6731        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6732        if not output_vcf:
 6733            log.debug(
 6734                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6735            )
 6736        else:
 6737            # Get new header from annotated vcf
 6738            log.debug(f"Initial header: {len(header.infos)} fields")
 6739            # Create new header with splice infos
 6740            new_vcf = Variants(input=output_vcf[0])
 6741            new_vcf_header = new_vcf.get_header().infos
 6742            for keys, infos in new_vcf_header.items():
 6743                if keys not in header.infos.keys():
 6744                    header.infos[keys] = infos
 6745            log.debug(f"New header: {len(header.infos)} fields")
 6746            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6747            self.update_from_vcf(output_vcf[0])
 6748
 6749        # Remove file
 6750        remove_if_exists(output_vcf)
 6751
 6752    ###
 6753    # Prioritization
 6754    ###
 6755
 6756    def get_config_default(self, name: str) -> dict:
 6757        """
 6758        The function `get_config_default` returns a dictionary containing default configurations for
 6759        various calculations and prioritizations.
 6760
 6761        :param name: The `get_config_default` function returns a dictionary containing default
 6762        configurations for different calculations and prioritizations. The `name` parameter is used to
 6763        specify which specific configuration to retrieve from the dictionary
 6764        :type name: str
 6765        :return: The function `get_config_default` returns a dictionary containing default configuration
 6766        settings for different calculations and prioritizations. The specific configuration settings are
 6767        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6768        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6769        returned. If there is no match, an empty dictionary is returned.
 6770        """
 6771
 6772        config_default = {
 6773            "calculations": {
 6774                "variant_chr_pos_alt_ref": {
 6775                    "type": "sql",
 6776                    "name": "variant_chr_pos_alt_ref",
 6777                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6778                    "available": False,
 6779                    "output_column_name": "variant_chr_pos_alt_ref",
 6780                    "output_column_type": "String",
 6781                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6782                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6783                    "operation_info": True,
 6784                },
 6785                "VARTYPE": {
 6786                    "type": "sql",
 6787                    "name": "VARTYPE",
 6788                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6789                    "available": True,
 6790                    "table": "variants",
 6791                    "output_column_name": "VARTYPE",
 6792                    "output_column_type": "String",
 6793                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6794                    "operation_query": """
 6795                            CASE
 6796                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6797                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6798                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6799                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6800                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6801                                ELSE 'UNDEFINED'
 6802                            END
 6803                            """,
 6804                    "info_fields": ["SVTYPE"],
 6805                    "operation_info": True,
 6806                },
 6807                "snpeff_hgvs": {
 6808                    "type": "python",
 6809                    "name": "snpeff_hgvs",
 6810                    "description": "HGVS nomenclatures from snpEff annotation",
 6811                    "available": True,
 6812                    "function_name": "calculation_extract_snpeff_hgvs",
 6813                    "function_params": ["snpeff_hgvs", "ANN"],
 6814                },
 6815                "snpeff_ann_explode": {
 6816                    "type": "python",
 6817                    "name": "snpeff_ann_explode",
 6818                    "description": "Explode snpEff annotations with uniquify values",
 6819                    "available": True,
 6820                    "function_name": "calculation_snpeff_ann_explode",
 6821                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6822                },
 6823                "snpeff_ann_explode_uniquify": {
 6824                    "type": "python",
 6825                    "name": "snpeff_ann_explode_uniquify",
 6826                    "description": "Explode snpEff annotations",
 6827                    "available": True,
 6828                    "function_name": "calculation_snpeff_ann_explode",
 6829                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6830                },
 6831                "snpeff_ann_explode_json": {
 6832                    "type": "python",
 6833                    "name": "snpeff_ann_explode_json",
 6834                    "description": "Explode snpEff annotations in JSON format",
 6835                    "available": True,
 6836                    "function_name": "calculation_snpeff_ann_explode",
 6837                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6838                },
 6839                "NOMEN": {
 6840                    "type": "python",
 6841                    "name": "NOMEN",
 6842                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6843                    "available": True,
 6844                    "function_name": "calculation_extract_nomen",
 6845                    "function_params": [],
 6846                },
 6847                "RENAME_INFO_FIELDS": {
 6848                    "type": "python",
 6849                    "name": "RENAME_INFO_FIELDS",
 6850                    "description": "Rename or remove INFO/tags",
 6851                    "available": True,
 6852                    "function_name": "calculation_rename_info_fields",
 6853                    "function_params": [],
 6854                },
 6855                "FINDBYPIPELINE": {
 6856                    "type": "python",
 6857                    "name": "FINDBYPIPELINE",
 6858                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6859                    "available": True,
 6860                    "function_name": "calculation_find_by_pipeline",
 6861                    "function_params": ["findbypipeline"],
 6862                },
 6863                "FINDBYSAMPLE": {
 6864                    "type": "python",
 6865                    "name": "FINDBYSAMPLE",
 6866                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6867                    "available": True,
 6868                    "function_name": "calculation_find_by_pipeline",
 6869                    "function_params": ["findbysample"],
 6870                },
 6871                "GENOTYPECONCORDANCE": {
 6872                    "type": "python",
 6873                    "name": "GENOTYPECONCORDANCE",
 6874                    "description": "Concordance of genotype for multi caller VCF",
 6875                    "available": True,
 6876                    "function_name": "calculation_genotype_concordance",
 6877                    "function_params": [],
 6878                },
 6879                "BARCODE": {
 6880                    "type": "python",
 6881                    "name": "BARCODE",
 6882                    "description": "BARCODE as VaRank tool",
 6883                    "available": True,
 6884                    "function_name": "calculation_barcode",
 6885                    "function_params": [],
 6886                },
 6887                "BARCODEFAMILY": {
 6888                    "type": "python",
 6889                    "name": "BARCODEFAMILY",
 6890                    "description": "BARCODEFAMILY as VaRank tool",
 6891                    "available": True,
 6892                    "function_name": "calculation_barcode_family",
 6893                    "function_params": ["BCF"],
 6894                },
 6895                "TRIO": {
 6896                    "type": "python",
 6897                    "name": "TRIO",
 6898                    "description": "Inheritance for a trio family",
 6899                    "available": True,
 6900                    "function_name": "calculation_trio",
 6901                    "function_params": [],
 6902                },
 6903                "VAF": {
 6904                    "type": "python",
 6905                    "name": "VAF",
 6906                    "description": "Variant Allele Frequency (VAF) harmonization",
 6907                    "available": True,
 6908                    "function_name": "calculation_vaf_normalization",
 6909                    "function_params": [],
 6910                },
 6911                "VAF_stats": {
 6912                    "type": "python",
 6913                    "name": "VAF_stats",
 6914                    "description": "Variant Allele Frequency (VAF) statistics",
 6915                    "available": True,
 6916                    "function_name": "calculation_genotype_stats",
 6917                    "function_params": ["VAF"],
 6918                },
 6919                "DP_stats": {
 6920                    "type": "python",
 6921                    "name": "DP_stats",
 6922                    "description": "Depth (DP) statistics",
 6923                    "available": True,
 6924                    "function_name": "calculation_genotype_stats",
 6925                    "function_params": ["DP"],
 6926                },
 6927                "variant_id": {
 6928                    "type": "python",
 6929                    "name": "variant_id",
 6930                    "description": "Variant ID generated from variant position and type",
 6931                    "available": True,
 6932                    "function_name": "calculation_variant_id",
 6933                    "function_params": [],
 6934                },
 6935                "transcripts_json": {
 6936                    "type": "python",
 6937                    "name": "transcripts_json",
 6938                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6939                    "available": True,
 6940                    "function_name": "calculation_transcripts_annotation",
 6941                    "function_params": ["transcripts_json", None],
 6942                },
 6943                "transcripts_ann": {
 6944                    "type": "python",
 6945                    "name": "transcripts_ann",
 6946                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6947                    "available": True,
 6948                    "function_name": "calculation_transcripts_annotation",
 6949                    "function_params": [None, "transcripts_ann"],
 6950                },
 6951                "transcripts_annotations": {
 6952                    "type": "python",
 6953                    "name": "transcripts_annotations",
 6954                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6955                    "available": True,
 6956                    "function_name": "calculation_transcripts_annotation",
 6957                    "function_params": [None, None],
 6958                },
 6959                "transcripts_prioritization": {
 6960                    "type": "python",
 6961                    "name": "transcripts_prioritization",
 6962                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6963                    "available": True,
 6964                    "function_name": "calculation_transcripts_prioritization",
 6965                    "function_params": [],
 6966                },
 6967                "transcripts_export": {
 6968                    "type": "python",
 6969                    "name": "transcripts_export",
 6970                    "description": "Export transcripts table/view as a file (using param.json)",
 6971                    "available": True,
 6972                    "function_name": "calculation_transcripts_export",
 6973                    "function_params": [],
 6974                },
 6975            },
 6976            "prioritizations": {
 6977                "default": {
 6978                    "ANN2": [
 6979                        {
 6980                            "type": "contains",
 6981                            "value": "HIGH",
 6982                            "score": 5,
 6983                            "flag": "PASS",
 6984                            "comment": [
 6985                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6986                            ],
 6987                        },
 6988                        {
 6989                            "type": "contains",
 6990                            "value": "MODERATE",
 6991                            "score": 3,
 6992                            "flag": "PASS",
 6993                            "comment": [
 6994                                "A non-disruptive variant that might change protein effectiveness"
 6995                            ],
 6996                        },
 6997                        {
 6998                            "type": "contains",
 6999                            "value": "LOW",
 7000                            "score": 0,
 7001                            "flag": "FILTERED",
 7002                            "comment": [
 7003                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 7004                            ],
 7005                        },
 7006                        {
 7007                            "type": "contains",
 7008                            "value": "MODIFIER",
 7009                            "score": 0,
 7010                            "flag": "FILTERED",
 7011                            "comment": [
 7012                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 7013                            ],
 7014                        },
 7015                    ],
 7016                }
 7017            },
 7018        }
 7019
 7020        return config_default.get(name, None)
 7021
 7022    def get_config_json(
 7023        self, name: str, config_dict: dict = {}, config_file: str = None
 7024    ) -> dict:
 7025        """
 7026        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7027        default values, a dictionary, and a file.
 7028
 7029        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7030        the name of the configuration. It is used to identify and retrieve the configuration settings
 7031        for a specific component or module
 7032        :type name: str
 7033        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7034        dictionary that allows you to provide additional configuration settings or overrides. When you
 7035        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7036        the key is the configuration setting you want to override or
 7037        :type config_dict: dict
 7038        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7039        specify the path to a configuration file that contains additional settings. If provided, the
 7040        function will read the contents of this file and update the configuration dictionary with the
 7041        values found in the file, overriding any existing values with the
 7042        :type config_file: str
 7043        :return: The function `get_config_json` returns a dictionary containing the configuration
 7044        settings.
 7045        """
 7046
 7047        # Create with default prioritizations
 7048        config_default = self.get_config_default(name=name)
 7049        configuration = config_default
 7050        # log.debug(f"configuration={configuration}")
 7051
 7052        # Replace prioritizations from dict
 7053        for config in config_dict:
 7054            configuration[config] = config_dict[config]
 7055
 7056        # Replace prioritizations from file
 7057        config_file = full_path(config_file)
 7058        if config_file:
 7059            if os.path.exists(config_file):
 7060                with open(config_file) as config_file_content:
 7061                    config_file_dict = yaml.safe_load(config_file_content)
 7062                for config in config_file_dict:
 7063                    configuration[config] = config_file_dict[config]
 7064            else:
 7065                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7066                log.error(msg_error)
 7067                raise ValueError(msg_error)
 7068
 7069        return configuration
 7070
 7071    def prioritization(
 7072        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7073    ) -> bool:
 7074        """
 7075        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7076        prioritizes variants based on configured profiles and criteria.
 7077
 7078        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7079        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7080        a table name is provided, the method will prioritize the variants in that specific table
 7081        :type table: str
 7082        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7083        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7084        provided, the code will use a default prefix value of "PZ"
 7085        :type pz_prefix: str
 7086        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7087        additional parameters specific to the prioritization process. These parameters can include
 7088        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7089        configurations needed for the prioritization of variants in a V
 7090        :type pz_param: dict
 7091        :return: A boolean value (True) is being returned from the `prioritization` function.
 7092        """
 7093
 7094        # Config
 7095        config = self.get_config()
 7096
 7097        # Param
 7098        param = self.get_param()
 7099
 7100        # Prioritization param
 7101        if pz_param is not None:
 7102            prioritization_param = pz_param
 7103        else:
 7104            prioritization_param = param.get("prioritization", {})
 7105
 7106        # Configuration profiles
 7107        prioritization_config_file = prioritization_param.get(
 7108            "prioritization_config", None
 7109        )
 7110        prioritization_config_file = full_path(prioritization_config_file)
 7111        prioritizations_config = self.get_config_json(
 7112            name="prioritizations", config_file=prioritization_config_file
 7113        )
 7114
 7115        # Prioritization prefix
 7116        pz_prefix_default = "PZ"
 7117        if pz_prefix is None:
 7118            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7119
 7120        # Prioritization options
 7121        profiles = prioritization_param.get("profiles", [])
 7122        if isinstance(profiles, str):
 7123            profiles = profiles.split(",")
 7124        pzfields = prioritization_param.get(
 7125            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7126        )
 7127        if isinstance(pzfields, str):
 7128            pzfields = pzfields.split(",")
 7129        default_profile = prioritization_param.get("default_profile", None)
 7130        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7131        prioritization_score_mode = prioritization_param.get(
 7132            "prioritization_score_mode", "HOWARD"
 7133        )
 7134
 7135        # Quick Prioritizations
 7136        prioritizations = param.get("prioritizations", None)
 7137        if prioritizations:
 7138            log.info("Quick Prioritization:")
 7139            for profile in prioritizations.split(","):
 7140                if profile not in profiles:
 7141                    profiles.append(profile)
 7142                    log.info(f"   {profile}")
 7143
 7144        # If profile "ALL" provided, all profiles in the config profiles
 7145        if "ALL" in profiles:
 7146            profiles = list(prioritizations_config.keys())
 7147
 7148        for profile in profiles:
 7149            if prioritizations_config.get(profile, None):
 7150                log.debug(f"Profile '{profile}' configured")
 7151            else:
 7152                msg_error = f"Profile '{profile}' NOT configured"
 7153                log.error(msg_error)
 7154                raise ValueError(msg_error)
 7155
 7156        if profiles:
 7157            log.info(f"Prioritization... ")
 7158        else:
 7159            log.debug(f"No profile defined")
 7160            return False
 7161
 7162        if not default_profile and len(profiles):
 7163            default_profile = profiles[0]
 7164
 7165        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7166        log.debug("Profiles to check: " + str(list(profiles)))
 7167
 7168        # Variables
 7169        if table is not None:
 7170            table_variants = table
 7171        else:
 7172            table_variants = self.get_table_variants(clause="update")
 7173        log.debug(f"Table to prioritize: {table_variants}")
 7174
 7175        # Added columns
 7176        added_columns = []
 7177
 7178        # Create list of PZfields
 7179        # List of PZFields
 7180        list_of_pzfields_original = pzfields + [
 7181            pzfield + pzfields_sep + profile
 7182            for pzfield in pzfields
 7183            for profile in profiles
 7184        ]
 7185        list_of_pzfields = []
 7186        log.debug(f"{list_of_pzfields_original}")
 7187
 7188        # Remove existing PZfields to use if exists
 7189        for pzfield in list_of_pzfields_original:
 7190            if self.get_header().infos.get(pzfield, None) is None:
 7191                list_of_pzfields.append(pzfield)
 7192                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7193            else:
 7194                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7195
 7196        if list_of_pzfields:
 7197
 7198            # Explode Infos prefix
 7199            explode_infos_prefix = self.get_explode_infos_prefix()
 7200
 7201            # PZfields tags description
 7202            PZfields_INFOS = {
 7203                f"{pz_prefix}Tags": {
 7204                    "ID": f"{pz_prefix}Tags",
 7205                    "Number": ".",
 7206                    "Type": "String",
 7207                    "Description": "Variant tags based on annotation criteria",
 7208                },
 7209                f"{pz_prefix}Score": {
 7210                    "ID": f"{pz_prefix}Score",
 7211                    "Number": 1,
 7212                    "Type": "Integer",
 7213                    "Description": "Variant score based on annotation criteria",
 7214                },
 7215                f"{pz_prefix}Flag": {
 7216                    "ID": f"{pz_prefix}Flag",
 7217                    "Number": 1,
 7218                    "Type": "String",
 7219                    "Description": "Variant flag based on annotation criteria",
 7220                },
 7221                f"{pz_prefix}Comment": {
 7222                    "ID": f"{pz_prefix}Comment",
 7223                    "Number": ".",
 7224                    "Type": "String",
 7225                    "Description": "Variant comment based on annotation criteria",
 7226                },
 7227                f"{pz_prefix}Infos": {
 7228                    "ID": f"{pz_prefix}Infos",
 7229                    "Number": ".",
 7230                    "Type": "String",
 7231                    "Description": "Variant infos based on annotation criteria",
 7232                },
 7233                f"{pz_prefix}Class": {
 7234                    "ID": f"{pz_prefix}Class",
 7235                    "Number": ".",
 7236                    "Type": "String",
 7237                    "Description": "Variant class based on annotation criteria",
 7238                },
 7239            }
 7240
 7241            # Create INFO fields if not exist
 7242            for field in PZfields_INFOS:
 7243                field_ID = PZfields_INFOS[field]["ID"]
 7244                field_description = PZfields_INFOS[field]["Description"]
 7245                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7246                    field_description = (
 7247                        PZfields_INFOS[field]["Description"]
 7248                        + f", profile {default_profile}"
 7249                    )
 7250                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7251                        field_ID,
 7252                        PZfields_INFOS[field]["Number"],
 7253                        PZfields_INFOS[field]["Type"],
 7254                        field_description,
 7255                        "unknown",
 7256                        "unknown",
 7257                        code_type_map[PZfields_INFOS[field]["Type"]],
 7258                    )
 7259
 7260            # Create INFO fields if not exist for each profile
 7261            for profile in prioritizations_config:
 7262                if profile in profiles or profiles == []:
 7263                    for field in PZfields_INFOS:
 7264                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7265                        field_description = (
 7266                            PZfields_INFOS[field]["Description"]
 7267                            + f", profile {profile}"
 7268                        )
 7269                        if (
 7270                            field_ID not in self.get_header().infos
 7271                            and field in pzfields
 7272                        ):
 7273                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7274                                field_ID,
 7275                                PZfields_INFOS[field]["Number"],
 7276                                PZfields_INFOS[field]["Type"],
 7277                                field_description,
 7278                                "unknown",
 7279                                "unknown",
 7280                                code_type_map[PZfields_INFOS[field]["Type"]],
 7281                            )
 7282
 7283            # Header
 7284            for pzfield in list_of_pzfields:
 7285                if re.match(f"{pz_prefix}Score.*", pzfield):
 7286                    added_column = self.add_column(
 7287                        table_name=table_variants,
 7288                        column_name=pzfield,
 7289                        column_type="INTEGER",
 7290                        default_value="0",
 7291                    )
 7292                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7293                    added_column = self.add_column(
 7294                        table_name=table_variants,
 7295                        column_name=pzfield,
 7296                        column_type="BOOLEAN",
 7297                        default_value="1",
 7298                    )
 7299                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7300                    added_column = self.add_column(
 7301                        table_name=table_variants,
 7302                        column_name=pzfield,
 7303                        column_type="VARCHAR[]",
 7304                        default_value="null",
 7305                    )
 7306                else:
 7307                    added_column = self.add_column(
 7308                        table_name=table_variants,
 7309                        column_name=pzfield,
 7310                        column_type="STRING",
 7311                        default_value="''",
 7312                    )
 7313                added_columns.append(added_column)
 7314
 7315            # Profiles
 7316            if profiles:
 7317
 7318                # foreach profile in configuration file
 7319                for profile in prioritizations_config:
 7320
 7321                    # If profile is asked in param, or ALL are asked (empty profile [])
 7322                    if profile in profiles or profiles == []:
 7323                        log.info(f"Profile '{profile}'")
 7324
 7325                        sql_set_info_option = ""
 7326
 7327                        sql_set_info = []
 7328
 7329                        # PZ fields set
 7330
 7331                        # PZScore
 7332                        if (
 7333                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7334                            in list_of_pzfields
 7335                        ):
 7336                            sql_set_info.append(
 7337                                f"""
 7338                                    concat(
 7339                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7340                                        {pz_prefix}Score{pzfields_sep}{profile}
 7341                                    ) 
 7342                                """
 7343                            )
 7344                            if (
 7345                                profile == default_profile
 7346                                and f"{pz_prefix}Score" in list_of_pzfields
 7347                            ):
 7348                                sql_set_info.append(
 7349                                    f"""
 7350                                        concat(
 7351                                            '{pz_prefix}Score=',
 7352                                            {pz_prefix}Score{pzfields_sep}{profile}
 7353                                        )
 7354                                    """
 7355                                )
 7356
 7357                        # PZFlag
 7358                        if (
 7359                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7360                            in list_of_pzfields
 7361                        ):
 7362                            sql_set_info.append(
 7363                                f"""
 7364                                    concat(
 7365                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7366                                        CASE 
 7367                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7368                                            THEN 'PASS'
 7369                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7370                                            THEN 'FILTERED'
 7371                                        END
 7372                                    ) 
 7373                                """
 7374                            )
 7375                            if (
 7376                                profile == default_profile
 7377                                and f"{pz_prefix}Flag" in list_of_pzfields
 7378                            ):
 7379                                sql_set_info.append(
 7380                                    f"""
 7381                                        concat(
 7382                                            '{pz_prefix}Flag=',
 7383                                            CASE 
 7384                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7385                                                THEN 'PASS'
 7386                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7387                                                THEN 'FILTERED'
 7388                                            END
 7389                                        )
 7390                                    """
 7391                                )
 7392
 7393                        # PZClass
 7394                        if (
 7395                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7396                            in list_of_pzfields
 7397                        ):
 7398                            sql_set_info.append(
 7399                                f"""
 7400                                    concat(
 7401                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7402                                        CASE
 7403                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7404                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7405                                            ELSE '.'
 7406                                        END 
 7407                                    )
 7408                                    
 7409                                """
 7410                            )
 7411                            if (
 7412                                profile == default_profile
 7413                                and f"{pz_prefix}Class" in list_of_pzfields
 7414                            ):
 7415                                sql_set_info.append(
 7416                                    f"""
 7417                                        concat(
 7418                                            '{pz_prefix}Class=',
 7419                                            CASE
 7420                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7421                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7422                                                ELSE '.'
 7423                                            END 
 7424                                        )
 7425                                    """
 7426                                )
 7427
 7428                        # PZComment
 7429                        if (
 7430                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7431                            in list_of_pzfields
 7432                        ):
 7433                            sql_set_info.append(
 7434                                f"""
 7435                                    CASE
 7436                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7437                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7438                                        ELSE ''
 7439                                    END
 7440                                """
 7441                            )
 7442                            if (
 7443                                profile == default_profile
 7444                                and f"{pz_prefix}Comment" in list_of_pzfields
 7445                            ):
 7446                                sql_set_info.append(
 7447                                    f"""
 7448                                        CASE
 7449                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7450                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7451                                            ELSE ''
 7452                                        END
 7453                                    """
 7454                                )
 7455
 7456                        # PZInfos
 7457                        if (
 7458                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7459                            in list_of_pzfields
 7460                        ):
 7461                            sql_set_info.append(
 7462                                f"""
 7463                                    CASE
 7464                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7465                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7466                                        ELSE ''
 7467                                    END
 7468                                """
 7469                            )
 7470                            if (
 7471                                profile == default_profile
 7472                                and f"{pz_prefix}Infos" in list_of_pzfields
 7473                            ):
 7474                                sql_set_info.append(
 7475                                    f"""
 7476                                        CASE
 7477                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7478                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7479                                            ELSE ''
 7480                                        END
 7481                                    """
 7482                                )
 7483
 7484                        # Merge PZfields
 7485                        sql_set_info_option = ""
 7486                        sql_set_sep = ""
 7487                        for sql_set in sql_set_info:
 7488                            if sql_set_sep:
 7489                                sql_set_info_option += f"""
 7490                                    , concat('{sql_set_sep}', {sql_set})
 7491                                """
 7492                            else:
 7493                                sql_set_info_option += f"""
 7494                                    , {sql_set}
 7495                                """
 7496                            sql_set_sep = ";"
 7497
 7498                        sql_queries = []
 7499                        for annotation in prioritizations_config[profile]:
 7500
 7501                            # skip special sections
 7502                            if annotation.startswith("_"):
 7503                                continue
 7504
 7505                            # For each criterions
 7506                            for criterion in prioritizations_config[profile][
 7507                                annotation
 7508                            ]:
 7509
 7510                                # Criterion mode
 7511                                criterion_mode = None
 7512                                if np.any(
 7513                                    np.isin(list(criterion.keys()), ["type", "value"])
 7514                                ):
 7515                                    criterion_mode = "operation"
 7516                                elif np.any(
 7517                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7518                                ):
 7519                                    criterion_mode = "sql"
 7520                                log.debug(f"Criterion Mode: {criterion_mode}")
 7521
 7522                                # Criterion parameters
 7523                                criterion_type = criterion.get("type", None)
 7524                                criterion_value = criterion.get("value", None)
 7525                                criterion_sql = criterion.get("sql", None)
 7526                                criterion_fields = criterion.get("fields", None)
 7527                                criterion_score = criterion.get("score", 0)
 7528                                criterion_flag = criterion.get("flag", "PASS")
 7529                                criterion_class = criterion.get("class", None)
 7530                                criterion_flag_bool = criterion_flag == "PASS"
 7531                                criterion_comment = (
 7532                                    ", ".join(criterion.get("comment", []))
 7533                                    .replace("'", "''")
 7534                                    .replace(";", ",")
 7535                                    .replace("\t", " ")
 7536                                )
 7537                                criterion_infos = (
 7538                                    str(criterion)
 7539                                    .replace("'", "''")
 7540                                    .replace(";", ",")
 7541                                    .replace("\t", " ")
 7542                                )
 7543
 7544                                # SQL
 7545                                if criterion_sql is not None and isinstance(
 7546                                    criterion_sql, list
 7547                                ):
 7548                                    criterion_sql = " ".join(criterion_sql)
 7549
 7550                                # Fields and explode
 7551                                if criterion_fields is None:
 7552                                    criterion_fields = [annotation]
 7553                                if not isinstance(criterion_fields, list):
 7554                                    criterion_fields = str(criterion_fields).split(",")
 7555
 7556                                # Class
 7557                                if criterion_class is not None and not isinstance(
 7558                                    criterion_class, list
 7559                                ):
 7560                                    criterion_class = str(criterion_class).split(",")
 7561
 7562                                for annotation_field in criterion_fields:
 7563
 7564                                    # Explode specific annotation
 7565                                    log.debug(
 7566                                        f"Explode annotation '{annotation_field}'"
 7567                                    )
 7568                                    added_columns += self.explode_infos(
 7569                                        prefix=explode_infos_prefix,
 7570                                        fields=[annotation_field],
 7571                                        table=table_variants,
 7572                                    )
 7573                                    extra_infos = self.get_extra_infos(
 7574                                        table=table_variants
 7575                                    )
 7576
 7577                                    # Check if annotation field is present
 7578                                    if (
 7579                                        f"{explode_infos_prefix}{annotation_field}"
 7580                                        not in extra_infos
 7581                                    ):
 7582                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7583                                        log.error(msq_err)
 7584                                        raise ValueError(msq_err)
 7585                                    else:
 7586                                        log.debug(
 7587                                            f"Annotation '{annotation_field}' in data"
 7588                                        )
 7589
 7590                                sql_set = []
 7591                                sql_set_info = []
 7592
 7593                                # PZ fields set
 7594
 7595                                # PZScore
 7596                                if (
 7597                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7598                                    in list_of_pzfields
 7599                                ):
 7600                                    # VaRank prioritization score mode
 7601                                    if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]:
 7602                                        sql_set.append(
 7603                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
 7604                                        )
 7605                                    # default HOWARD prioritization score mode
 7606                                    else:
 7607                                        sql_set.append(
 7608                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7609                                        )
 7610
 7611                                # PZFlag
 7612                                if (
 7613                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7614                                    in list_of_pzfields
 7615                                ):
 7616                                    sql_set.append(
 7617                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7618                                    )
 7619
 7620                                # PZClass
 7621                                if (
 7622                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7623                                    in list_of_pzfields
 7624                                    and criterion_class is not None
 7625                                ):
 7626                                    sql_set.append(
 7627                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7628                                    )
 7629
 7630                                # PZComment
 7631                                if (
 7632                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7633                                    in list_of_pzfields
 7634                                ):
 7635                                    sql_set.append(
 7636                                        f"""
 7637                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7638                                                concat(
 7639                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7640                                                    CASE 
 7641                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7642                                                        THEN ', '
 7643                                                        ELSE ''
 7644                                                    END,
 7645                                                    '{criterion_comment}'
 7646                                                )
 7647                                        """
 7648                                    )
 7649
 7650                                # PZInfos
 7651                                if (
 7652                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7653                                    in list_of_pzfields
 7654                                ):
 7655                                    sql_set.append(
 7656                                        f"""
 7657                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7658                                                concat(
 7659                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7660                                                    '{criterion_infos}'
 7661                                                )
 7662                                        """
 7663                                    )
 7664                                sql_set_option = ",".join(sql_set)
 7665
 7666                                # Criterion and comparison
 7667                                if sql_set_option:
 7668
 7669                                    if criterion_mode in ["operation"]:
 7670
 7671                                        try:
 7672                                            float(criterion_value)
 7673                                            sql_update = f"""
 7674                                                UPDATE {table_variants}
 7675                                                SET {sql_set_option}
 7676                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7677                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7678                                            """
 7679                                        except:
 7680                                            contains_option = ""
 7681                                            if criterion_type == "contains":
 7682                                                contains_option = ".*"
 7683                                            sql_update = f"""
 7684                                                UPDATE {table_variants}
 7685                                                SET {sql_set_option}
 7686                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7687                                            """
 7688                                        sql_queries.append(sql_update)
 7689
 7690                                    elif criterion_mode in ["sql"]:
 7691
 7692                                        sql_update = f"""
 7693                                            UPDATE {table_variants}
 7694                                            SET {sql_set_option}
 7695                                            WHERE {criterion_sql}
 7696                                        """
 7697                                        sql_queries.append(sql_update)
 7698
 7699                                    else:
 7700                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7701                                        log.error(msg_err)
 7702                                        raise ValueError(msg_err)
 7703
 7704                                else:
 7705                                    log.warning(
 7706                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7707                                    )
 7708
 7709                        # PZTags
 7710                        if (
 7711                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7712                            in list_of_pzfields
 7713                        ):
 7714
 7715                            # Create PZFalgs value
 7716                            pztags_value = ""
 7717                            pztags_sep_default = ","
 7718                            pztags_sep = ""
 7719                            for pzfield in pzfields:
 7720                                if pzfield not in [f"{pz_prefix}Tags"]:
 7721                                    if (
 7722                                        f"{pzfield}{pzfields_sep}{profile}"
 7723                                        in list_of_pzfields
 7724                                    ):
 7725                                        if pzfield in [f"{pz_prefix}Flag"]:
 7726                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7727                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7728                                                    THEN 'PASS'
 7729                                                    ELSE 'FILTERED'
 7730                                                END, '"""
 7731                                        elif pzfield in [f"{pz_prefix}Class"]:
 7732                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7733                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7734                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7735                                                    ELSE '.'
 7736                                                END, '"""
 7737                                        else:
 7738                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7739                                        pztags_sep = pztags_sep_default
 7740
 7741                            # Add Query update for PZFlags
 7742                            sql_update_pztags = f"""
 7743                                UPDATE {table_variants}
 7744                                SET INFO = concat(
 7745                                        INFO,
 7746                                        CASE WHEN INFO NOT in ('','.')
 7747                                                THEN ';'
 7748                                                ELSE ''
 7749                                        END,
 7750                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7751                                    )
 7752                                """
 7753                            sql_queries.append(sql_update_pztags)
 7754
 7755                            # Add Query update for PZFlags for default
 7756                            if profile == default_profile:
 7757                                sql_update_pztags_default = f"""
 7758                                UPDATE {table_variants}
 7759                                SET INFO = concat(
 7760                                        INFO,
 7761                                        ';',
 7762                                        '{pz_prefix}Tags={pztags_value}'
 7763                                    )
 7764                                """
 7765                                sql_queries.append(sql_update_pztags_default)
 7766
 7767                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7768
 7769                        if sql_queries:
 7770
 7771                            for sql_query in sql_queries:
 7772                                log.debug(
 7773                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7774                                )
 7775                                self.conn.execute(sql_query)
 7776
 7777                        log.info(f"""Profile '{profile}' - Update... """)
 7778                        sql_query_update = f"""
 7779                            UPDATE {table_variants}
 7780                            SET INFO =  
 7781                                concat(
 7782                                    CASE
 7783                                        WHEN INFO NOT IN ('','.')
 7784                                        THEN concat(INFO, ';')
 7785                                        ELSE ''
 7786                                    END
 7787                                    {sql_set_info_option}
 7788                                )
 7789                        """
 7790                        self.conn.execute(sql_query_update)
 7791
 7792        else:
 7793
 7794            log.warning(f"No profiles in parameters")
 7795
 7796        # Remove added columns
 7797        for added_column in added_columns:
 7798            self.drop_column(column=added_column)
 7799
 7800        # Explode INFOS fields into table fields
 7801        if self.get_explode_infos():
 7802            self.explode_infos(
 7803                prefix=self.get_explode_infos_prefix(),
 7804                fields=self.get_explode_infos_fields(),
 7805                force=True,
 7806            )
 7807
 7808        return True
 7809
 7810    ###
 7811    # HGVS
 7812    ###
 7813
 7814    def annotation_hgvs(self, threads: int = None) -> None:
 7815        """
 7816        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7817        coordinates and alleles.
 7818
 7819        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7820        threads to use for parallel processing. If no value is provided, it will default to the number
 7821        of threads obtained from the `get_threads()` method
 7822        :type threads: int
 7823        """
 7824
 7825        # Function for each partition of the Dask Dataframe
 7826        def partition_function(partition):
 7827            """
 7828            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7829            each row of a DataFrame called `partition`.
 7830
 7831            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7832            to be processed
 7833            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7834            the "partition" dataframe along the axis 1.
 7835            """
 7836            return partition.apply(annotation_hgvs_partition, axis=1)
 7837
 7838        def annotation_hgvs_partition(row) -> str:
 7839            """
 7840            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7841            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7842
 7843            :param row: A dictionary-like object that contains the values for the following keys:
 7844            :return: a string that contains the HGVS names associated with the given row of data.
 7845            """
 7846
 7847            chr = row["CHROM"]
 7848            pos = row["POS"]
 7849            ref = row["REF"]
 7850            alt = row["ALT"]
 7851
 7852            # Find list of associated transcripts
 7853            transcripts_list = list(
 7854                polars_conn.execute(
 7855                    f"""
 7856                SELECT transcript
 7857                FROM refseq_df
 7858                WHERE CHROM='{chr}'
 7859                AND POS={pos}
 7860            """
 7861                )["transcript"]
 7862            )
 7863
 7864            # Full HGVS annotation in list
 7865            hgvs_full_list = []
 7866
 7867            for transcript_name in transcripts_list:
 7868
 7869                # Transcript
 7870                transcript = get_transcript(
 7871                    transcripts=transcripts, transcript_name=transcript_name
 7872                )
 7873                # Exon
 7874                if use_exon:
 7875                    exon = transcript.find_exon_number(pos)
 7876                else:
 7877                    exon = None
 7878                # Protein
 7879                transcript_protein = None
 7880                if use_protein or add_protein or full_format:
 7881                    transcripts_protein = list(
 7882                        polars_conn.execute(
 7883                            f"""
 7884                        SELECT protein
 7885                        FROM refseqlink_df
 7886                        WHERE transcript='{transcript_name}'
 7887                        LIMIT 1
 7888                    """
 7889                        )["protein"]
 7890                    )
 7891                    if len(transcripts_protein):
 7892                        transcript_protein = transcripts_protein[0]
 7893
 7894                # HGVS name
 7895                hgvs_name = format_hgvs_name(
 7896                    chr,
 7897                    pos,
 7898                    ref,
 7899                    alt,
 7900                    genome=genome,
 7901                    transcript=transcript,
 7902                    transcript_protein=transcript_protein,
 7903                    exon=exon,
 7904                    use_gene=use_gene,
 7905                    use_protein=use_protein,
 7906                    full_format=full_format,
 7907                    use_version=use_version,
 7908                    codon_type=codon_type,
 7909                )
 7910                hgvs_full_list.append(hgvs_name)
 7911                if add_protein and not use_protein and not full_format:
 7912                    hgvs_name = format_hgvs_name(
 7913                        chr,
 7914                        pos,
 7915                        ref,
 7916                        alt,
 7917                        genome=genome,
 7918                        transcript=transcript,
 7919                        transcript_protein=transcript_protein,
 7920                        exon=exon,
 7921                        use_gene=use_gene,
 7922                        use_protein=True,
 7923                        full_format=False,
 7924                        use_version=use_version,
 7925                        codon_type=codon_type,
 7926                    )
 7927                    hgvs_full_list.append(hgvs_name)
 7928
 7929            # Create liste of HGVS annotations
 7930            hgvs_full = ",".join(hgvs_full_list)
 7931
 7932            return hgvs_full
 7933
 7934        # Polars connexion
 7935        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7936
 7937        # Config
 7938        config = self.get_config()
 7939
 7940        # Databases
 7941        # Genome
 7942        databases_genomes_folders = (
 7943            config.get("folders", {})
 7944            .get("databases", {})
 7945            .get("genomes", DEFAULT_GENOME_FOLDER)
 7946        )
 7947        databases_genome = (
 7948            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7949        )
 7950        # refseq database folder
 7951        databases_refseq_folders = (
 7952            config.get("folders", {})
 7953            .get("databases", {})
 7954            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7955        )
 7956        # refseq
 7957        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7958        # refSeqLink
 7959        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7960
 7961        # Param
 7962        param = self.get_param()
 7963
 7964        # Quick HGVS
 7965        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7966            log.info(f"Quick HGVS Annotation:")
 7967            if not param.get("hgvs", None):
 7968                param["hgvs"] = {}
 7969            for option in param.get("hgvs_options", "").split(","):
 7970                option_var_val = option.split("=")
 7971                option_var = option_var_val[0]
 7972                if len(option_var_val) > 1:
 7973                    option_val = option_var_val[1]
 7974                else:
 7975                    option_val = "True"
 7976                if option_val.upper() in ["TRUE"]:
 7977                    option_val = True
 7978                elif option_val.upper() in ["FALSE"]:
 7979                    option_val = False
 7980                log.info(f"   {option_var}={option_val}")
 7981                param["hgvs"][option_var] = option_val
 7982
 7983        # Check if HGVS annotation enabled
 7984        if "hgvs" in param:
 7985            log.info(f"HGVS Annotation... ")
 7986            for hgvs_option in param.get("hgvs", {}):
 7987                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7988        else:
 7989            return
 7990
 7991        # HGVS Param
 7992        param_hgvs = param.get("hgvs", {})
 7993        use_exon = param_hgvs.get("use_exon", False)
 7994        use_gene = param_hgvs.get("use_gene", False)
 7995        use_protein = param_hgvs.get("use_protein", False)
 7996        add_protein = param_hgvs.get("add_protein", False)
 7997        full_format = param_hgvs.get("full_format", False)
 7998        use_version = param_hgvs.get("use_version", False)
 7999        codon_type = param_hgvs.get("codon_type", "3")
 8000
 8001        # refSseq refSeqLink
 8002        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 8003        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 8004
 8005        # Assembly
 8006        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 8007
 8008        # Genome
 8009        genome_file = None
 8010        if find_genome(databases_genome):
 8011            genome_file = find_genome(databases_genome)
 8012        else:
 8013            genome_file = find_genome(
 8014                genome_path=databases_genomes_folders, assembly=assembly
 8015            )
 8016        log.debug("Genome: " + str(genome_file))
 8017
 8018        # refSseq
 8019        refseq_file = find_file_prefix(
 8020            input_file=databases_refseq,
 8021            prefix="ncbiRefSeq",
 8022            folder=databases_refseq_folders,
 8023            assembly=assembly,
 8024        )
 8025        log.debug("refSeq: " + str(refseq_file))
 8026
 8027        # refSeqLink
 8028        refseqlink_file = find_file_prefix(
 8029            input_file=databases_refseqlink,
 8030            prefix="ncbiRefSeqLink",
 8031            folder=databases_refseq_folders,
 8032            assembly=assembly,
 8033        )
 8034        log.debug("refSeqLink: " + str(refseqlink_file))
 8035
 8036        # Threads
 8037        if not threads:
 8038            threads = self.get_threads()
 8039        log.debug("Threads: " + str(threads))
 8040
 8041        # Variables
 8042        table_variants = self.get_table_variants(clause="update")
 8043
 8044        # Get variants SNV and InDel only
 8045        query_variants = f"""
 8046            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 8047            FROM {table_variants}
 8048            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 8049            """
 8050        df_variants = self.get_query_to_df(query_variants)
 8051
 8052        # Added columns
 8053        added_columns = []
 8054
 8055        # Add hgvs column in variants table
 8056        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 8057        added_column = self.add_column(
 8058            table_variants, hgvs_column_name, "STRING", default_value=None
 8059        )
 8060        added_columns.append(added_column)
 8061
 8062        log.debug(f"refSeq loading...")
 8063        # refSeq in duckDB
 8064        refseq_table = get_refseq_table(
 8065            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 8066        )
 8067        # Loading all refSeq in Dataframe
 8068        refseq_query = f"""
 8069            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 8070            FROM {refseq_table}
 8071            JOIN df_variants ON (
 8072                {refseq_table}.chrom = df_variants.CHROM
 8073                AND {refseq_table}.txStart<=df_variants.POS
 8074                AND {refseq_table}.txEnd>=df_variants.POS
 8075            )
 8076        """
 8077        refseq_df = self.conn.query(refseq_query).pl()
 8078
 8079        if refseqlink_file:
 8080            log.debug(f"refSeqLink loading...")
 8081            # refSeqLink in duckDB
 8082            refseqlink_table = get_refseq_table(
 8083                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 8084            )
 8085            # Loading all refSeqLink in Dataframe
 8086            protacc_column = "protAcc_with_ver"
 8087            mrnaacc_column = "mrnaAcc_with_ver"
 8088            refseqlink_query = f"""
 8089                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 8090                FROM {refseqlink_table} 
 8091                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 8092                WHERE protAcc_without_ver IS NOT NULL
 8093            """
 8094            # Polars Dataframe
 8095            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 8096
 8097        # Read RefSeq transcripts into a python dict/model.
 8098        log.debug(f"Transcripts loading...")
 8099        with tempfile.TemporaryDirectory() as tmpdir:
 8100            transcripts_query = f"""
 8101                COPY (
 8102                    SELECT {refseq_table}.*
 8103                    FROM {refseq_table}
 8104                    JOIN df_variants ON (
 8105                        {refseq_table}.chrom=df_variants.CHROM
 8106                        AND {refseq_table}.txStart<=df_variants.POS
 8107                        AND {refseq_table}.txEnd>=df_variants.POS
 8108                    )
 8109                )
 8110                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 8111            """
 8112            self.conn.query(transcripts_query)
 8113            with open(f"{tmpdir}/transcript.tsv") as infile:
 8114                transcripts = read_transcripts(infile)
 8115
 8116        # Polars connexion
 8117        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8118
 8119        log.debug("Genome loading...")
 8120        # Read genome sequence using pyfaidx.
 8121        genome = Fasta(genome_file)
 8122
 8123        log.debug("Start annotation HGVS...")
 8124
 8125        # Create
 8126        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 8127        ddf = dd.from_pandas(df_variants, npartitions=threads)
 8128
 8129        # Use dask.dataframe.apply() to apply function on each partition
 8130        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 8131
 8132        # Convert Dask DataFrame to Pandas Dataframe
 8133        df = ddf.compute()
 8134
 8135        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 8136        with tempfile.TemporaryDirectory() as tmpdir:
 8137            df_parquet = os.path.join(tmpdir, "df.parquet")
 8138            df.to_parquet(df_parquet)
 8139
 8140            # Update hgvs column
 8141            update_variant_query = f"""
 8142                UPDATE {table_variants}
 8143                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 8144                FROM read_parquet('{df_parquet}') as df
 8145                WHERE variants."#CHROM" = df.CHROM
 8146                AND variants.POS = df.POS
 8147                AND variants.REF = df.REF
 8148                AND variants.ALT = df.ALT
 8149                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 8150                """
 8151            self.execute_query(update_variant_query)
 8152
 8153        # Update INFO column
 8154        sql_query_update = f"""
 8155            UPDATE {table_variants}
 8156            SET INFO = 
 8157                concat(
 8158                    CASE 
 8159                        WHEN INFO NOT IN ('','.')
 8160                        THEN concat(INFO, ';')
 8161                        ELSE ''
 8162                    END,
 8163                    'hgvs=',
 8164                    {hgvs_column_name}
 8165                )
 8166            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 8167            """
 8168        self.execute_query(sql_query_update)
 8169
 8170        # Add header
 8171        HGVS_INFOS = {
 8172            "hgvs": {
 8173                "ID": "hgvs",
 8174                "Number": ".",
 8175                "Type": "String",
 8176                "Description": f"HGVS annotatation with HOWARD",
 8177            }
 8178        }
 8179
 8180        for field in HGVS_INFOS:
 8181            field_ID = HGVS_INFOS[field]["ID"]
 8182            field_description = HGVS_INFOS[field]["Description"]
 8183            self.get_header().infos[field_ID] = vcf.parser._Info(
 8184                field_ID,
 8185                HGVS_INFOS[field]["Number"],
 8186                HGVS_INFOS[field]["Type"],
 8187                field_description,
 8188                "unknown",
 8189                "unknown",
 8190                code_type_map[HGVS_INFOS[field]["Type"]],
 8191            )
 8192
 8193        # Remove added columns
 8194        for added_column in added_columns:
 8195            self.drop_column(column=added_column)
 8196
 8197    ###
 8198    # Calculation
 8199    ###
 8200
 8201    def get_operations_help(
 8202        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8203    ) -> list:
 8204
 8205        # Init
 8206        operations_help = []
 8207
 8208        # operations
 8209        operations = self.get_config_json(
 8210            name="calculations",
 8211            config_dict=operations_config_dict,
 8212            config_file=operations_config_file,
 8213        )
 8214        for op in operations:
 8215            op_name = operations[op].get("name", op).upper()
 8216            op_description = operations[op].get("description", op_name)
 8217            op_available = operations[op].get("available", False)
 8218            if op_available:
 8219                operations_help.append(f"   {op_name}: {op_description}")
 8220
 8221        # Sort operations
 8222        operations_help.sort()
 8223
 8224        # insert header
 8225        operations_help.insert(0, "Available calculation operations:")
 8226
 8227        # Return
 8228        return operations_help
 8229
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run the configured calculation operations on the variants table.

        For each requested operation, looks up its definition in the
        operations configuration and dispatches it to either
        `calculation_process_function` (type "python") or
        `calculation_process_sql` (type "sql").

        Operations can come from (in priority order):
        - `param["calculation"]["calculations"]` (falling back to the
          `operations` argument),
        - the quick comma-separated `param["calculations"]` string, which is
          merged in front of the above while preserving its order.

        param json example:
            "calculation": {
                "calculations": {
                    "NOMEN": {
                        "options": {
                            "hgvs_field": "hgvs"
                        }
                    }
                }
            }

        :param operations: fallback operations dictionary, used when param does
        not define `calculation.calculations`
        :type operations: dict (optional)
        :param operations_config_dict: optional operations configuration
        dictionary, merged by `get_config_json`
        :type operations_config_dict: dict (optional)
        :param operations_config_file: optional path to an operations
        configuration file; defaults to `param["calculation"]["calculation_config"]`
        :type operations_config_file: str (optional)
        :raises ValueError: if an operation is unknown or has an unsupported type
        """

        # Param
        param = self.get_param()

        # Check operations config file (fallback to param if not provided)
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config (dict and/or file merged with built-in config)
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation lookup is case-insensitive)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (the argument is only a fallback)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add (comma-separated list in param["calculations"])
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order):
            # quick operations first, reusing any options already in param
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param (appended after the quick ones)
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation (last-resort fallback to param)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # For each operations: dispatch by configured type ("python" or "sql")
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 8357
 8358    def calculation_process_sql(
 8359        self, operation: dict, operation_name: str = "unknown"
 8360    ) -> None:
 8361        """
 8362        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8363        performs the operation, updating the specified table with the result.
 8364
 8365        :param operation: The `operation` parameter is a dictionary that contains information about the
 8366        mathematical operation to be performed. It includes the following keys:
 8367        :type operation: dict
 8368        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8369        the mathematical operation being performed. It is used for logging and error handling purposes,
 8370        defaults to unknown
 8371        :type operation_name: str (optional)
 8372        """
 8373
 8374        # Operation infos
 8375        operation_name = operation.get("name", "unknown")
 8376        log.debug(f"process SQL {operation_name}")
 8377        output_column_name = operation.get("output_column_name", operation_name)
 8378        output_column_type = operation.get("output_column_type", "String")
 8379        prefix = operation.get("explode_infos_prefix", "")
 8380        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8381        output_column_description = operation.get(
 8382            "output_column_description", f"{operation_name} operation"
 8383        )
 8384        operation_query = operation.get("operation_query", None)
 8385        if isinstance(operation_query, list):
 8386            operation_query = " ".join(operation_query)
 8387        operation_info_fields = operation.get("info_fields", [])
 8388        operation_info_fields_check = operation.get("info_fields_check", False)
 8389        operation_info = operation.get("operation_info", True)
 8390        operation_table = operation.get(
 8391            "table", self.get_table_variants(clause="alter")
 8392        )
 8393
 8394        # table variants
 8395        if operation_table:
 8396            table_variants = operation_table
 8397        else:
 8398            table_variants = self.get_table_variants(clause="alter")
 8399
 8400        if operation_query:
 8401
 8402            # Info fields check
 8403            operation_info_fields_check_result = True
 8404            if operation_info_fields_check:
 8405                header_infos = self.get_header().infos
 8406                for info_field in operation_info_fields:
 8407                    operation_info_fields_check_result = (
 8408                        operation_info_fields_check_result
 8409                        and info_field in header_infos
 8410                    )
 8411
 8412            # If info fields available
 8413            if operation_info_fields_check_result:
 8414
 8415                # Added_columns
 8416                added_columns = []
 8417
 8418                # Create VCF header field
 8419                vcf_reader = self.get_header()
 8420                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8421                    output_column_name,
 8422                    ".",
 8423                    output_column_type,
 8424                    output_column_description,
 8425                    "howard calculation",
 8426                    "0",
 8427                    self.code_type_map.get(output_column_type),
 8428                )
 8429
 8430                # Explode infos if needed
 8431                log.debug(f"calculation_process_sql prefix {prefix}")
 8432                added_columns += self.explode_infos(
 8433                    prefix=prefix,
 8434                    fields=[output_column_name] + operation_info_fields,
 8435                    force=False,
 8436                    table=table_variants,
 8437                )
 8438
 8439                # Create column
 8440                added_column = self.add_column(
 8441                    table_name=table_variants,
 8442                    column_name=prefix + output_column_name,
 8443                    column_type=output_column_type_sql,
 8444                    default_value="null",
 8445                )
 8446                added_columns.append(added_column)
 8447
 8448                # Operation calculation
 8449                try:
 8450
 8451                    # Query to update calculation column
 8452                    sql_update = f"""
 8453                        UPDATE {table_variants}
 8454                        SET "{prefix}{output_column_name}" = ({operation_query})
 8455                    """
 8456                    self.conn.execute(sql_update)
 8457
 8458                    # Add to INFO
 8459                    if operation_info:
 8460                        sql_update_info = f"""
 8461                            UPDATE {table_variants}
 8462                            SET "INFO" =
 8463                                concat(
 8464                                    CASE
 8465                                        WHEN "INFO" IS NOT NULL
 8466                                        THEN concat("INFO", ';')
 8467                                        ELSE ''
 8468                                    END,
 8469                                    '{output_column_name}=',
 8470                                    "{prefix}{output_column_name}"
 8471                                )
 8472                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8473                        """
 8474                        self.conn.execute(sql_update_info)
 8475
 8476                except:
 8477                    log.error(
 8478                        f"Operations config: Calculation '{operation_name}' query failed"
 8479                    )
 8480                    raise ValueError(
 8481                        f"Operations config: Calculation '{operation_name}' query failed"
 8482                    )
 8483
 8484                # Remove added columns
 8485                for added_column in added_columns:
 8486                    log.debug(f"added_column: {added_column}")
 8487                    self.drop_column(column=added_column)
 8488
 8489            else:
 8490                log.error(
 8491                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8492                )
 8493                raise ValueError(
 8494                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8495                )
 8496
 8497        else:
 8498            log.error(
 8499                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8500            )
 8501            raise ValueError(
 8502                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8503            )
 8504
 8505    def calculation_process_function(
 8506        self, operation: dict, operation_name: str = "unknown"
 8507    ) -> None:
 8508        """
 8509        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8510        function with the given parameters.
 8511
 8512        :param operation: The `operation` parameter is a dictionary that contains information about the
 8513        operation to be performed. It has the following keys:
 8514        :type operation: dict
 8515        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8516        the operation being performed. It is used for logging purposes, defaults to unknown
 8517        :type operation_name: str (optional)
 8518        """
 8519
 8520        operation_name = operation["name"]
 8521        log.debug(f"process Python {operation_name}")
 8522        function_name = operation["function_name"]
 8523        function_params = operation["function_params"]
 8524        getattr(self, function_name)(*function_params)
 8525
 8526    def calculation_variant_id(self) -> None:
 8527        """
 8528        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8529        updates the INFO field of a variants table with the variant ID.
 8530        """
 8531
 8532        # variant_id annotation field
 8533        variant_id_tag = self.get_variant_id_column()
 8534        added_columns = [variant_id_tag]
 8535
 8536        # variant_id hgvs tags"
 8537        vcf_infos_tags = {
 8538            variant_id_tag: "howard variant ID annotation",
 8539        }
 8540
 8541        # Variants table
 8542        table_variants = self.get_table_variants()
 8543
 8544        # Header
 8545        vcf_reader = self.get_header()
 8546
 8547        # Add variant_id to header
 8548        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8549            variant_id_tag,
 8550            ".",
 8551            "String",
 8552            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8553            "howard calculation",
 8554            "0",
 8555            self.code_type_map.get("String"),
 8556        )
 8557
 8558        # Update
 8559        sql_update = f"""
 8560            UPDATE {table_variants}
 8561            SET "INFO" = 
 8562                concat(
 8563                    CASE
 8564                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8565                        THEN ''
 8566                        ELSE concat("INFO", ';')
 8567                    END,
 8568                    '{variant_id_tag}=',
 8569                    "{variant_id_tag}"
 8570                )
 8571        """
 8572        self.conn.execute(sql_update)
 8573
 8574        # Remove added columns
 8575        for added_column in added_columns:
 8576            self.drop_column(column=added_column)
 8577
 8578    def calculation_extract_snpeff_hgvs(
 8579        self,
 8580        snpeff_hgvs: str = "snpeff_hgvs",
 8581        snpeff_field: str = "ANN",
 8582    ) -> None:
 8583        """
 8584        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 8585        annotation field in a VCF file and adds them as a new column in the variants table.
 8586
 8587        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 8588        function is used to specify the name of the column that will store the HGVS nomenclatures
 8589        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 8590        snpeff_hgvs
 8591        :type snpeff_hgvs: str (optional)
 8592        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 8593        function represents the field in the VCF file that contains SnpEff annotations. This field is
 8594        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 8595        to ANN
 8596        :type snpeff_field: str (optional)
 8597        """
 8598
 8599        # Snpeff hgvs tags
 8600        vcf_infos_tags = {
 8601            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 8602        }
 8603
 8604        # Prefix
 8605        prefix = self.get_explode_infos_prefix()
 8606        if prefix:
 8607            prefix = "INFO/"
 8608
 8609        # snpEff fields
 8610        speff_ann_infos = prefix + snpeff_field
 8611        speff_hgvs_infos = prefix + snpeff_hgvs
 8612
 8613        # Variants table
 8614        table_variants = self.get_table_variants()
 8615
 8616        # Header
 8617        vcf_reader = self.get_header()
 8618
 8619        # Add columns
 8620        added_columns = []
 8621
 8622        # Explode HGVS field in column
 8623        added_columns += self.explode_infos(fields=[snpeff_field])
 8624
 8625        if snpeff_field in vcf_reader.infos:
 8626
 8627            log.debug(vcf_reader.infos[snpeff_field])
 8628
 8629            # Extract ANN header
 8630            ann_description = vcf_reader.infos[snpeff_field].desc
 8631            pattern = r"'(.+?)'"
 8632            match = re.search(pattern, ann_description)
 8633            if match:
 8634                ann_header_match = match.group(1).split(" | ")
 8635                ann_header_desc = {}
 8636                for i in range(len(ann_header_match)):
 8637                    ann_header_info = "".join(
 8638                        char for char in ann_header_match[i] if char.isalnum()
 8639                    )
 8640                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8641                if not ann_header_desc:
 8642                    raise ValueError("Invalid header description format")
 8643            else:
 8644                raise ValueError("Invalid header description format")
 8645
 8646            # Create variant id
 8647            variant_id_column = self.get_variant_id_column()
 8648            added_columns += [variant_id_column]
 8649
 8650            # Create dataframe
 8651            dataframe_snpeff_hgvs = self.get_query_to_df(
 8652                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8653            )
 8654
 8655            # Create main NOMEN column
 8656            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8657                speff_ann_infos
 8658            ].apply(
 8659                lambda x: extract_snpeff_hgvs(
 8660                    str(x), header=list(ann_header_desc.values())
 8661                )
 8662            )
 8663
 8664            # Add snpeff_hgvs to header
 8665            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8666                snpeff_hgvs,
 8667                ".",
 8668                "String",
 8669                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8670                "howard calculation",
 8671                "0",
 8672                self.code_type_map.get("String"),
 8673            )
 8674
 8675            # Update
 8676            sql_update = f"""
 8677                UPDATE variants
 8678                SET "INFO" = 
 8679                    concat(
 8680                        CASE
 8681                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8682                            THEN ''
 8683                            ELSE concat("INFO", ';')
 8684                        END,
 8685                        CASE 
 8686                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8687                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8688                            THEN concat(
 8689                                    '{snpeff_hgvs}=',
 8690                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8691                                )
 8692                            ELSE ''
 8693                        END
 8694                    )
 8695                FROM dataframe_snpeff_hgvs
 8696                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8697
 8698            """
 8699            self.conn.execute(sql_update)
 8700
 8701            # Delete dataframe
 8702            del dataframe_snpeff_hgvs
 8703            gc.collect()
 8704
 8705        else:
 8706
 8707            log.warning(
 8708                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8709            )
 8710
 8711        # Remove added columns
 8712        for added_column in added_columns:
 8713            self.drop_column(column=added_column)
 8714
 8715    def calculation_snpeff_ann_explode(
 8716        self,
 8717        uniquify: bool = True,
 8718        output_format: str = "fields",
 8719        output_prefix: str = "snpeff_",
 8720        snpeff_field: str = "ANN",
 8721    ) -> None:
 8722        """
 8723        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8724        exploding the HGVS field and updating variant information accordingly.
 8725
 8726        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8727        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8728        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8729        defaults to True
 8730        :type uniquify: bool (optional)
 8731        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8732        function specifies the format in which the output annotations will be generated. It has a
 8733        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8734        format, defaults to fields
 8735        :type output_format: str (optional)
 8736        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8737        method is used to specify the prefix that will be added to the output annotations generated
 8738        during the calculation process. This prefix helps to differentiate the newly added annotations
 8739        from existing ones in the output data. By default, the, defaults to ANN_
 8740        :type output_prefix: str (optional)
 8741        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8742        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8743        field will be processed to explode the HGVS annotations and update the variant information
 8744        accordingly, defaults to ANN
 8745        :type snpeff_field: str (optional)
 8746        """
 8747
 8748        # SnpEff annotation field
 8749        snpeff_hgvs = "snpeff_ann_explode"
 8750
 8751        # Snpeff hgvs tags
 8752        vcf_infos_tags = {
 8753            snpeff_hgvs: "Explode snpEff annotations",
 8754        }
 8755
 8756        # Prefix
 8757        prefix = self.get_explode_infos_prefix()
 8758        if prefix:
 8759            prefix = "INFO/"
 8760
 8761        # snpEff fields
 8762        speff_ann_infos = prefix + snpeff_field
 8763        speff_hgvs_infos = prefix + snpeff_hgvs
 8764
 8765        # Variants table
 8766        table_variants = self.get_table_variants()
 8767
 8768        # Header
 8769        vcf_reader = self.get_header()
 8770
 8771        # Add columns
 8772        added_columns = []
 8773
 8774        # Explode HGVS field in column
 8775        added_columns += self.explode_infos(fields=[snpeff_field])
 8776        log.debug(f"snpeff_field={snpeff_field}")
 8777        log.debug(f"added_columns={added_columns}")
 8778
 8779        if snpeff_field in vcf_reader.infos:
 8780
 8781            # Extract ANN header
 8782            ann_description = vcf_reader.infos[snpeff_field].desc
 8783            pattern = r"'(.+?)'"
 8784            match = re.search(pattern, ann_description)
 8785            if match:
 8786                ann_header_match = match.group(1).split(" | ")
 8787                ann_header = []
 8788                ann_header_desc = {}
 8789                for i in range(len(ann_header_match)):
 8790                    ann_header_info = "".join(
 8791                        char for char in ann_header_match[i] if char.isalnum()
 8792                    )
 8793                    ann_header.append(ann_header_info)
 8794                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8795                if not ann_header_desc:
 8796                    raise ValueError("Invalid header description format")
 8797            else:
 8798                raise ValueError("Invalid header description format")
 8799
 8800            # Create variant id
 8801            variant_id_column = self.get_variant_id_column()
 8802            added_columns += [variant_id_column]
 8803
 8804            # Create dataframe
 8805            dataframe_snpeff_hgvs = self.get_query_to_df(
 8806                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8807            )
 8808
 8809            # Create snpEff columns
 8810            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8811                speff_ann_infos
 8812            ].apply(
 8813                lambda x: explode_snpeff_ann(
 8814                    str(x),
 8815                    uniquify=uniquify,
 8816                    output_format=output_format,
 8817                    prefix=output_prefix,
 8818                    header=list(ann_header_desc.values()),
 8819                )
 8820            )
 8821
 8822            # Header
 8823            ann_annotations_prefix = ""
 8824            if output_format.upper() in ["JSON"]:
 8825                ann_annotations_prefix = f"{output_prefix}="
 8826                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8827                    output_prefix,
 8828                    ".",
 8829                    "String",
 8830                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8831                    + " - JSON format",
 8832                    "howard calculation",
 8833                    "0",
 8834                    self.code_type_map.get("String"),
 8835                )
 8836            else:
 8837                for ann_annotation in ann_header:
 8838                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8839                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8840                        ann_annotation_id,
 8841                        ".",
 8842                        "String",
 8843                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8844                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8845                        "howard calculation",
 8846                        "0",
 8847                        self.code_type_map.get("String"),
 8848                    )
 8849
 8850            # Update
 8851            sql_update = f"""
 8852                UPDATE variants
 8853                SET "INFO" = 
 8854                    concat(
 8855                        CASE
 8856                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8857                            THEN ''
 8858                            ELSE concat("INFO", ';')
 8859                        END,
 8860                        CASE 
 8861                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8862                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8863                            THEN concat(
 8864                                '{ann_annotations_prefix}',
 8865                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8866                                )
 8867                            ELSE ''
 8868                        END
 8869                    )
 8870                FROM dataframe_snpeff_hgvs
 8871                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8872
 8873            """
 8874            self.conn.execute(sql_update)
 8875
 8876            # Delete dataframe
 8877            del dataframe_snpeff_hgvs
 8878            gc.collect()
 8879
 8880        else:
 8881
 8882            log.warning(
 8883                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8884            )
 8885
 8886        # Remove added columns
 8887        for added_column in added_columns:
 8888            self.drop_column(column=added_column)
 8889
 8890    def calculation_extract_nomen(self) -> None:
 8891        """
 8892        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
 8893        """
 8894
 8895        # NOMEN field
 8896        field_nomen_dict = "NOMEN_DICT"
 8897
 8898        # NOMEN structure
 8899        nomen_dict = {
 8900            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
 8901            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
 8902            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
 8903            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
 8904            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
 8905            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
 8906            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
 8907            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
 8908            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
 8909            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
 8910        }
 8911
 8912        # Param
 8913        param = self.get_param()
 8914
 8915        # Prefix
 8916        prefix = self.get_explode_infos_prefix()
 8917
 8918        # Header
 8919        vcf_reader = self.get_header()
 8920
 8921        # Added columns
 8922        added_columns = []
 8923
 8924        # Get HGVS field
 8925        hgvs_field = (
 8926            param.get("calculation", {})
 8927            .get("calculations", {})
 8928            .get("NOMEN", {})
 8929            .get("options", {})
 8930            .get("hgvs_field", "hgvs")
 8931        )
 8932
 8933        # Get NOMEN pattern
 8934        nomen_pattern = (
 8935            param.get("calculation", {})
 8936            .get("calculations", {})
 8937            .get("NOMEN", {})
 8938            .get("options", {})
 8939            .get("pattern", None)
 8940        )
 8941
 8942        # transcripts list of preference sources
 8943        transcripts_sources = {}
 8944
 8945        # Get transcripts
 8946        transcripts_file = (
 8947            param.get("calculation", {})
 8948            .get("calculations", {})
 8949            .get("NOMEN", {})
 8950            .get("options", {})
 8951            .get("transcripts", None)
 8952        )
 8953        transcripts_file = full_path(transcripts_file)
 8954        if transcripts_file:
 8955            if os.path.exists(transcripts_file):
 8956                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
 8957                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
 8958                transcripts_sources["file"] = transcripts_from_file
 8959            else:
 8960                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
 8961                log.error(msg_err)
 8962                raise ValueError(msg_err)
 8963
 8964        # Get transcripts table
 8965        transcripts_table = (
 8966            param.get("calculation", {})
 8967            .get("calculations", {})
 8968            .get("NOMEN", {})
 8969            .get("options", {})
 8970            .get("transcripts_table", self.get_table_variants())
 8971        )
 8972        # Get transcripts column
 8973        transcripts_column = (
 8974            param.get("calculation", {})
 8975            .get("calculations", {})
 8976            .get("NOMEN", {})
 8977            .get("options", {})
 8978            .get("transcripts_column", None)
 8979        )
 8980
 8981        if transcripts_table and transcripts_column:
 8982            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
 8983            # Explode if not exists
 8984            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
 8985        else:
 8986            extra_field_transcript = f"NULL"
 8987
 8988        # Transcripts of preference source order
 8989        transcripts_order = (
 8990            param.get("calculation", {})
 8991            .get("calculations", {})
 8992            .get("NOMEN", {})
 8993            .get("options", {})
 8994            .get("transcripts_order", ["column", "file"])
 8995        )
 8996
 8997        # Transcripts from file
 8998        transcripts = transcripts_sources.get("file", [])
 8999
 9000        # Explode HGVS field in column
 9001        added_columns += self.explode_infos(fields=[hgvs_field])
 9002
 9003        # extra infos
 9004        extra_infos = self.get_extra_infos()
 9005        extra_field = prefix + hgvs_field
 9006
 9007        if extra_field in extra_infos:
 9008
 9009            # Create dataframe
 9010            dataframe_hgvs = self.get_query_to_df(
 9011                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
 9012            )
 9013
 9014            # Create main NOMEN column
 9015            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
 9016                lambda x: find_nomen(
 9017                    hgvs=x.hgvs,
 9018                    transcript=x.transcript,
 9019                    transcripts=transcripts,
 9020                    pattern=nomen_pattern,
 9021                    transcripts_source_order=transcripts_order,
 9022                ),
 9023                axis=1,
 9024            )
 9025
 9026            # Explode NOMEN Structure and create SQL set for update
 9027            sql_nomen_fields = []
 9028            for nomen_field in nomen_dict:
 9029
 9030                # Explode each field into a column
 9031                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
 9032                    lambda x: dict(x).get(nomen_field, "")
 9033                )
 9034
 9035                # Create VCF header field
 9036                vcf_reader.infos[nomen_field] = vcf.parser._Info(
 9037                    nomen_field,
 9038                    ".",
 9039                    "String",
 9040                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
 9041                    "howard calculation",
 9042                    "0",
 9043                    self.code_type_map.get("String"),
 9044                )
 9045                sql_nomen_fields.append(
 9046                    f"""
 9047                        CASE 
 9048                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
 9049                            THEN concat(
 9050                                    ';{nomen_field}=',
 9051                                    dataframe_hgvs."{nomen_field}"
 9052                                )
 9053                            ELSE ''
 9054                        END
 9055                    """
 9056                )
 9057
 9058            # SQL set for update
 9059            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
 9060
 9061            # Update
 9062            sql_update = f"""
 9063                UPDATE variants
 9064                SET "INFO" = 
 9065                    concat(
 9066                        CASE
 9067                            WHEN "INFO" IS NULL
 9068                            THEN ''
 9069                            ELSE "INFO"
 9070                        END,
 9071                        {sql_nomen_fields_set}
 9072                    )
 9073                FROM dataframe_hgvs
 9074                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
 9075                    AND variants."POS" = dataframe_hgvs."POS" 
 9076                    AND variants."REF" = dataframe_hgvs."REF"
 9077                    AND variants."ALT" = dataframe_hgvs."ALT"
 9078            """
 9079            self.conn.execute(sql_update)
 9080
 9081            # Delete dataframe
 9082            del dataframe_hgvs
 9083            gc.collect()
 9084
 9085        # Remove added columns
 9086        for added_column in added_columns:
 9087            self.drop_column(column=added_column)
 9088
 9089    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9090        """
 9091        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9092        pipeline/sample for a variant and updates the variant information in a VCF file.
 9093
 9094        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9095        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9096        VCF header and to update the corresponding field in the variants table, defaults to
 9097        findbypipeline
 9098        :type tag: str (optional)
 9099        """
 9100
 9101        # if FORMAT and samples
 9102        if (
 9103            "FORMAT" in self.get_header_columns_as_list()
 9104            and self.get_header_sample_list()
 9105        ):
 9106
 9107            # findbypipeline annotation field
 9108            findbypipeline_tag = tag
 9109
 9110            # VCF infos tags
 9111            vcf_infos_tags = {
 9112                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9113            }
 9114
 9115            # Prefix
 9116            prefix = self.get_explode_infos_prefix()
 9117
 9118            # Field
 9119            findbypipeline_infos = prefix + findbypipeline_tag
 9120
 9121            # Variants table
 9122            table_variants = self.get_table_variants()
 9123
 9124            # Header
 9125            vcf_reader = self.get_header()
 9126
 9127            # Create variant id
 9128            variant_id_column = self.get_variant_id_column()
 9129            added_columns = [variant_id_column]
 9130
 9131            # variant_id, FORMAT and samples
 9132            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9133                self.get_header_sample_list()
 9134            )
 9135
 9136            # Create dataframe
 9137            dataframe_findbypipeline = self.get_query_to_df(
 9138                f""" SELECT {samples_fields} FROM {table_variants} """
 9139            )
 9140
 9141            # Create findbypipeline column
 9142            dataframe_findbypipeline[findbypipeline_infos] = (
 9143                dataframe_findbypipeline.apply(
 9144                    lambda row: findbypipeline(
 9145                        row, samples=self.get_header_sample_list()
 9146                    ),
 9147                    axis=1,
 9148                )
 9149            )
 9150
 9151            # Add snpeff_hgvs to header
 9152            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9153                findbypipeline_tag,
 9154                ".",
 9155                "String",
 9156                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9157                "howard calculation",
 9158                "0",
 9159                self.code_type_map.get("String"),
 9160            )
 9161
 9162            # Update
 9163            sql_update = f"""
 9164                UPDATE variants
 9165                SET "INFO" = 
 9166                    concat(
 9167                        CASE
 9168                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9169                            THEN ''
 9170                            ELSE concat("INFO", ';')
 9171                        END,
 9172                        CASE 
 9173                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9174                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9175                            THEN concat(
 9176                                    '{findbypipeline_tag}=',
 9177                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9178                                )
 9179                            ELSE ''
 9180                        END
 9181                    )
 9182                FROM dataframe_findbypipeline
 9183                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9184            """
 9185            self.conn.execute(sql_update)
 9186
 9187            # Remove added columns
 9188            for added_column in added_columns:
 9189                self.drop_column(column=added_column)
 9190
 9191            # Delete dataframe
 9192            del dataframe_findbypipeline
 9193            gc.collect()
 9194
 9195    def calculation_genotype_concordance(self) -> None:
 9196        """
 9197        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9198        multi-caller VCF files and updates the variant information in the database.
 9199        """
 9200
 9201        # if FORMAT and samples
 9202        if (
 9203            "FORMAT" in self.get_header_columns_as_list()
 9204            and self.get_header_sample_list()
 9205        ):
 9206
 9207            # genotypeconcordance annotation field
 9208            genotypeconcordance_tag = "genotypeconcordance"
 9209
 9210            # VCF infos tags
 9211            vcf_infos_tags = {
 9212                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9213            }
 9214
 9215            # Prefix
 9216            prefix = self.get_explode_infos_prefix()
 9217
 9218            # Field
 9219            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9220
 9221            # Variants table
 9222            table_variants = self.get_table_variants()
 9223
 9224            # Header
 9225            vcf_reader = self.get_header()
 9226
 9227            # Create variant id
 9228            variant_id_column = self.get_variant_id_column()
 9229            added_columns = [variant_id_column]
 9230
 9231            # variant_id, FORMAT and samples
 9232            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9233                self.get_header_sample_list()
 9234            )
 9235
 9236            # Create dataframe
 9237            dataframe_genotypeconcordance = self.get_query_to_df(
 9238                f""" SELECT {samples_fields} FROM {table_variants} """
 9239            )
 9240
 9241            # Create genotypeconcordance column
 9242            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9243                dataframe_genotypeconcordance.apply(
 9244                    lambda row: genotypeconcordance(
 9245                        row, samples=self.get_header_sample_list()
 9246                    ),
 9247                    axis=1,
 9248                )
 9249            )
 9250
 9251            # Add genotypeconcordance to header
 9252            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9253                genotypeconcordance_tag,
 9254                ".",
 9255                "String",
 9256                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9257                "howard calculation",
 9258                "0",
 9259                self.code_type_map.get("String"),
 9260            )
 9261
 9262            # Update
 9263            sql_update = f"""
 9264                UPDATE variants
 9265                SET "INFO" = 
 9266                    concat(
 9267                        CASE
 9268                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9269                            THEN ''
 9270                            ELSE concat("INFO", ';')
 9271                        END,
 9272                        CASE
 9273                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9274                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9275                            THEN concat(
 9276                                    '{genotypeconcordance_tag}=',
 9277                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9278                                )
 9279                            ELSE ''
 9280                        END
 9281                    )
 9282                FROM dataframe_genotypeconcordance
 9283                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9284            """
 9285            self.conn.execute(sql_update)
 9286
 9287            # Remove added columns
 9288            for added_column in added_columns:
 9289                self.drop_column(column=added_column)
 9290
 9291            # Delete dataframe
 9292            del dataframe_genotypeconcordance
 9293            gc.collect()
 9294
 9295    def calculation_barcode(self, tag: str = "barcode") -> None:
 9296        """
 9297        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9298        updates the INFO field in the file with the calculated barcode values.
 9299
 9300        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9301        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9302        the default tag name is set to "barcode", defaults to barcode
 9303        :type tag: str (optional)
 9304        """
 9305
 9306        # if FORMAT and samples
 9307        if (
 9308            "FORMAT" in self.get_header_columns_as_list()
 9309            and self.get_header_sample_list()
 9310        ):
 9311
 9312            # barcode annotation field
 9313            if not tag:
 9314                tag = "barcode"
 9315
 9316            # VCF infos tags
 9317            vcf_infos_tags = {
 9318                tag: "barcode calculation (VaRank)",
 9319            }
 9320
 9321            # Prefix
 9322            prefix = self.get_explode_infos_prefix()
 9323
 9324            # Field
 9325            barcode_infos = prefix + tag
 9326
 9327            # Variants table
 9328            table_variants = self.get_table_variants()
 9329
 9330            # Header
 9331            vcf_reader = self.get_header()
 9332
 9333            # Create variant id
 9334            variant_id_column = self.get_variant_id_column()
 9335            added_columns = [variant_id_column]
 9336
 9337            # variant_id, FORMAT and samples
 9338            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9339                self.get_header_sample_list()
 9340            )
 9341
 9342            # Create dataframe
 9343            dataframe_barcode = self.get_query_to_df(
 9344                f""" SELECT {samples_fields} FROM {table_variants} """
 9345            )
 9346
 9347            # Create barcode column
 9348            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9349                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9350            )
 9351
 9352            # Add barcode to header
 9353            vcf_reader.infos[tag] = vcf.parser._Info(
 9354                tag,
 9355                ".",
 9356                "String",
 9357                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9358                "howard calculation",
 9359                "0",
 9360                self.code_type_map.get("String"),
 9361            )
 9362
 9363            # Update
 9364            sql_update = f"""
 9365                UPDATE {table_variants}
 9366                SET "INFO" = 
 9367                    concat(
 9368                        CASE
 9369                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9370                            THEN ''
 9371                            ELSE concat("INFO", ';')
 9372                        END,
 9373                        CASE
 9374                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9375                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9376                            THEN concat(
 9377                                    '{tag}=',
 9378                                    dataframe_barcode."{barcode_infos}"
 9379                                )
 9380                            ELSE ''
 9381                        END
 9382                    )
 9383                FROM dataframe_barcode
 9384                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9385            """
 9386            self.conn.execute(sql_update)
 9387
 9388            # Remove added columns
 9389            for added_column in added_columns:
 9390                self.drop_column(column=added_column)
 9391
 9392            # Delete dataframe
 9393            del dataframe_barcode
 9394            gc.collect()
 9395
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Compute a family barcode (genotype pattern across the pedigree samples) for
        each variant and append it to the genotype columns as two new FORMAT
        fields: `<tag>` (the barcode value) and `<tag>S` (the samples used).

        The pedigree is read from param
        `calculation.calculations.BARCODEFAMILY.family_pedigree` and may be a file
        path (parsed with YAML, which also accepts JSON), a JSON string, a
        comma-separated list of sample names, or a dict. When absent, all header
        samples are used.

        Nothing is done when the VCF has no FORMAT column or no samples.

        :param tag: The `tag` parameter is used to specify the barcode FORMAT tag
        that will be added to the VCF file during the calculation process. If no
        value is provided for the `tag` parameter, the default value used is
        "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or resolves to
        no samples
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default when tag is empty/None)
            if not tag:
                tag = "BCF"

            # VCF infos tags (descriptions for the generated FORMAT fields)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (YAML parser also accepts JSON content)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = yaml.safe_load(ped)

                # Pedigree is a string: try JSON first, otherwise treat it as a
                # comma-separated list of sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: each sample name maps to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict (member -> sample name)
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of sample names from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample from the VCF header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree resolved to at least one sample
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field name holding the computed barcode in the dataframe
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe with the columns needed for the barcode computation
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (row-wise barcode over the pedigree samples)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: pedigree samples get the barcode
            # value + sample list, FORMAT gets the new tag names, other samples
            # get '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Used to turn './.' genotypes into './.' plus one '.' per FORMAT
                # field (strips field names, then inserts '.' after each ':')
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # NOTE: the SQL below references the pandas DataFrame by its Python
            # variable name ('dataframe_barcode') — presumably resolved by a
            # DuckDB replacement scan, so do not rename the local variable
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
 9585
    def calculation_trio(self) -> None:
        """
        Perform trio calculation (father/mother/child genotype pattern) on the
        variants and append the result to the INFO column as a `trio` tag.

        The trio pedigree is read from param
        `calculation.calculations.TRIO.trio_pedigree` and may be a file path
        (parsed with YAML, which also accepts JSON), a JSON string, a
        comma-separated string of exactly three sample names
        (father,mother,child), or a dict with keys 'father', 'mother', 'child'.
        When absent, the first three header samples are used.

        Nothing is done when the VCF has no FORMAT column or no samples.

        :raises ValueError: if the trio pedigree is not well formatted or fewer
        than three samples are available
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file (YAML parser also accepts JSON content)
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = yaml.safe_load(trio_ped)

                # Trio pedigree is a string: try JSON first, otherwise expect a
                # comma-separated 'father,mother,child' list
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is already a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list in father/mother/child order
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                # No pedigree configured: default to the first three samples
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree has exactly the three expected members
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Field name holding the computed trio value in the dataframe
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe with the columns needed for the trio computation
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create trio column (row-wise trio over father/mother/child)
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the "snpEff hgvs annotations" fallback looks like a
            # copy-paste from another calculation; it is dead code since trio_tag
            # is always a key of vcf_infos_tags
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append 'trio=<value>' to INFO when the computed value is non-empty.
            # NOTE: the SQL references the pandas DataFrame by its Python variable
            # name ('dataframe_trio') — presumably resolved by a DuckDB
            # replacement scan, so do not rename the local variable
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                             AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                    '{trio_tag}=',
                                    dataframe_trio."{trio_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_trio
            gc.collect()
 9764
 9765    def calculation_vaf_normalization(self) -> None:
 9766        """
 9767        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9768        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9769        :return: The function does not return anything.
 9770        """
 9771
 9772        # if FORMAT and samples
 9773        if (
 9774            "FORMAT" in self.get_header_columns_as_list()
 9775            and self.get_header_sample_list()
 9776        ):
 9777
 9778            # vaf_normalization annotation field
 9779            vaf_normalization_tag = "VAF"
 9780
 9781            # VCF infos tags
 9782            vcf_infos_tags = {
 9783                "VAF": "VAF Variant Frequency",
 9784            }
 9785
 9786            # Prefix
 9787            prefix = self.get_explode_infos_prefix()
 9788
 9789            # Variants table
 9790            table_variants = self.get_table_variants()
 9791
 9792            # Header
 9793            vcf_reader = self.get_header()
 9794
 9795            # Do not calculate if VAF already exists
 9796            if "VAF" in vcf_reader.formats:
 9797                log.debug("VAF already on genotypes")
 9798                return
 9799
 9800            # Create variant id
 9801            variant_id_column = self.get_variant_id_column()
 9802            added_columns = [variant_id_column]
 9803
 9804            # variant_id, FORMAT and samples
 9805            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9806                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9807            )
 9808
 9809            # Create dataframe
 9810            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9811            log.debug(f"query={query}")
 9812            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9813
 9814            vaf_normalization_set = []
 9815
 9816            # for each sample vaf_normalization
 9817            for sample in self.get_header_sample_list():
 9818                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9819                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9820                )
 9821                vaf_normalization_set.append(
 9822                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9823                )
 9824
 9825            # Add VAF to FORMAT
 9826            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9827                "FORMAT"
 9828            ].apply(lambda x: str(x) + ":VAF")
 9829            vaf_normalization_set.append(
 9830                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9831            )
 9832
 9833            # Add vaf_normalization to header
 9834            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9835                id=vaf_normalization_tag,
 9836                num="1",
 9837                type="Float",
 9838                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9839                type_code=self.code_type_map.get("Float"),
 9840            )
 9841
 9842            # Create fields to add in INFO
 9843            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9844
 9845            # Update
 9846            sql_update = f"""
 9847                UPDATE {table_variants}
 9848                SET {sql_vaf_normalization_set}
 9849                FROM dataframe_vaf_normalization
 9850                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9851
 9852            """
 9853            self.conn.execute(sql_update)
 9854
 9855            # Remove added columns
 9856            for added_column in added_columns:
 9857                self.drop_column(column=added_column)
 9858
 9859            # Delete dataframe
 9860            del dataframe_vaf_normalization
 9861            gc.collect()
 9862
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Compute per-variant statistics over a genotype FORMAT field across all
        samples (number, list, min, max, mean, median, standard deviation) and
        append them to the INFO column as `<info>_stats_*` tags.

        Nothing is done when the VCF has no FORMAT column or no samples.

        :param info: name of the genotype FORMAT field to aggregate (e.g. "VAF");
        it is used to derive the generated INFO tag names such as
        `<info>_stats_nb`, `<info>_stats_min`, ..., defaults to VAF
        :type info: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (one INFO tag per generated statistic)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field name holding the computed stats dict in the dataframe
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe with the columns needed for the stats computation
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (row-wise stats dict over all samples)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL CASE fragments, one per statistic tag
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic from the per-row stats dict into its
                # own dataframe column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separate successive tags with ';' (the first tag gets none)
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the statistic tags to INFO.
            # NOTE: the SQL references the pandas DataFrame by its Python variable
            # name ('dataframe_vaf_stats') — presumably resolved by a DuckDB
            # replacement scan, so do not rename the local variable
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
10000
10001    def calculation_transcripts_annotation(
10002        self, info_json: str = None, info_format: str = None
10003    ) -> None:
10004        """
10005        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10006        field to it if transcripts are available.
10007
10008        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10009        is a string parameter that represents the information field to be used in the transcripts JSON.
10010        It is used to specify the JSON format for the transcripts information. If no value is provided
10011        when calling the method, it defaults to "
10012        :type info_json: str
10013        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10014        method is a string parameter that specifies the format of the information field to be used in
10015        the transcripts JSON. It is used to define the format of the information field
10016        :type info_format: str
10017        """
10018
10019        # Create transcripts table
10020        transcripts_table = self.create_transcript_view()
10021
10022        # Add info field
10023        if transcripts_table:
10024            self.transcript_view_to_variants(
10025                transcripts_table=transcripts_table,
10026                transcripts_info_field_json=info_json,
10027                transcripts_info_field_format=info_format,
10028            )
10029        else:
10030            log.info("No Transcripts to process. Check param.json file configuration")
10031
10032    def calculation_transcripts_prioritization(self) -> None:
10033        """
10034        The function `calculation_transcripts_prioritization` creates a transcripts table and
10035        prioritizes transcripts based on certain criteria.
10036        """
10037
10038        # Create transcripts table
10039        transcripts_table = self.create_transcript_view()
10040
10041        # Add info field
10042        if transcripts_table:
10043            self.transcripts_prioritization(transcripts_table=transcripts_table)
10044        else:
10045            log.info("No Transcripts to process. Check param.json file configuration")
10046
10047    def calculation_transcripts_export(self) -> None:
10048        """ """
10049
10050        # Create transcripts table
10051        transcripts_table = self.create_transcript_view()
10052
10053        # Add info field
10054        if transcripts_table:
10055            self.transcripts_export(transcripts_table=transcripts_table)
10056        else:
10057            log.info("No Transcripts to process. Check param.json file configuration")
10058
10059    ###############
10060    # Transcripts #
10061    ###############
10062
10063    def transcripts_export(
10064        self, transcripts_table: str = None, param: dict = {}
10065    ) -> bool:
10066        """ """
10067
10068        log.debug("Start transcripts export...")
10069
10070        # Param
10071        if not param:
10072            param = self.get_param()
10073
10074        # Param export
10075        param_transcript_export = param.get("transcripts", {}).get("export", {})
10076
10077        # Output file
10078        transcripts_export_output = param_transcript_export.get("output", None)
10079
10080        if not param_transcript_export or not transcripts_export_output:
10081            log.warning(f"No transcriipts export parameters defined!")
10082            return False
10083
10084        # List of transcripts annotations
10085        query_describe = f"""
10086            SELECT column_name
10087            FROM (
10088                    DESCRIBE SELECT * FROM {transcripts_table}
10089                )
10090            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10091        """
10092        transcripts_annotations_list = list(
10093            self.get_query_to_df(query=query_describe)["column_name"]
10094        )
10095
10096        # Create transcripts table for export
10097        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10098            random.choices(string.ascii_uppercase + string.digits, k=10)
10099        )
10100        query_create_transcripts_table_export = f"""
10101            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10102        """
10103        self.execute_query(query=query_create_transcripts_table_export)
10104
10105        # Output file format
10106        transcripts_export_output_format = get_file_format(
10107            filename=transcripts_export_output
10108        )
10109
10110        # Format VCF - construct INFO
10111        if transcripts_export_output_format in ["vcf"]:
10112
10113            # Construct query update INFO and header
10114            query_update_info = []
10115            for field in transcripts_annotations_list:
10116
10117                # If field not in header
10118                if field not in self.get_header_infos_list():
10119
10120                    # Add PZ Transcript in header
10121                    self.get_header().infos[field] = vcf.parser._Info(
10122                        field,
10123                        ".",
10124                        "String",
10125                        f"Annotation '{field}' from transcript view",
10126                        "unknown",
10127                        "unknown",
10128                        0,
10129                    )
10130
10131                # Add field as INFO/tag
10132                query_update_info.append(
10133                    f"""
10134                        CASE
10135                            WHEN "{field}" IS NOT NULL
10136                            THEN concat('{field}=', "{field}", ';')    
10137                            ELSE ''     
10138                        END
10139                        """
10140                )
10141
10142            # Query param
10143            query_update_info_value = (
10144                f""" concat('',  {", ".join(query_update_info)}) """
10145            )
10146            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10147
10148        else:
10149
10150            # Query param
10151            query_update_info_value = f""" NULL """
10152            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10153
10154        # Update query INFO column
10155        query_update = f"""
10156            UPDATE {transcripts_table_export}
10157            SET INFO = {query_update_info_value}
10158
10159        """
10160        self.execute_query(query=query_update)
10161
10162        # Export
10163        self.export_output(
10164            output_file=transcripts_export_output,
10165            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10166        )
10167
10168        # Drop transcripts export table
10169        query_drop_transcripts_table_export = f"""
10170            DROP TABLE {transcripts_table_export}
10171        """
10172        self.execute_query(query=query_drop_transcripts_table_export)
10173
10174    def transcripts_prioritization(
10175        self, transcripts_table: str = None, param: dict = {}
10176    ) -> bool:
10177        """
10178        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10179        and updates the variants table with the prioritized information.
10180
10181        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10182        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10183        This parameter is used to identify the table where the transcripts data is stored for the
10184        prioritization process
10185        :type transcripts_table: str
10186        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10187        that contains various configuration settings for the prioritization process of transcripts. It
10188        is used to customize the behavior of the prioritization algorithm and includes settings such as
10189        the prefix for prioritization fields, default profiles, and other
10190        :type param: dict
10191        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10192        transcripts prioritization process is successfully completed, and `False` if there are any
10193        issues or if no profile is defined for transcripts prioritization.
10194        """
10195
10196        log.debug("Start transcripts prioritization...")
10197
10198        # Param
10199        if not param:
10200            param = self.get_param()
10201
10202        # Variants table
10203        table_variants = self.get_table_variants()
10204
10205        # Transcripts table
10206        if transcripts_table is None:
10207            transcripts_table = self.create_transcript_view(
10208                transcripts_table="transcripts", param=param
10209            )
10210        if transcripts_table is None:
10211            msg_err = "No Transcripts table availalble"
10212            log.error(msg_err)
10213            raise ValueError(msg_err)
10214        log.debug(f"transcripts_table={transcripts_table}")
10215
10216        # Get transcripts columns
10217        columns_as_list_query = f"""
10218            DESCRIBE {transcripts_table}
10219        """
10220        columns_as_list = list(
10221            self.get_query_to_df(columns_as_list_query)["column_name"]
10222        )
10223
10224        # Create INFO if not exists
10225        if "INFO" not in columns_as_list:
10226            query_add_info = f"""
10227                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10228            """
10229            self.execute_query(query_add_info)
10230
10231        # Prioritization param and Force only PZ Score and Flag
10232        pz_param = param.get("transcripts", {}).get("prioritization", {})
10233
10234        # PZ profile by default
10235        pz_profile_default = (
10236            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10237        )
10238
10239        # Exit if no profile
10240        if pz_profile_default is None:
10241            log.warning("No profile defined for transcripts prioritization")
10242            return False
10243
10244        # PZ fields
10245        pz_param_pzfields = {}
10246
10247        # PZ field transcripts
10248        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10249
10250        # Add PZ Transcript in header
10251        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10252            pz_fields_transcripts,
10253            ".",
10254            "String",
10255            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10256            "unknown",
10257            "unknown",
10258            code_type_map["String"],
10259        )
10260
10261        # Mandatory fields
10262        pz_mandatory_fields_list = [
10263            "Score",
10264            "Flag",
10265            "Tags",
10266            "Comment",
10267            "Infos",
10268            "Class",
10269        ]
10270        pz_mandatory_fields = []
10271        for pz_mandatory_field in pz_mandatory_fields_list:
10272            pz_mandatory_fields.append(
10273                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10274            )
10275
10276        # PZ fields in param
10277        for pz_field in pz_param.get("pzfields", []):
10278            if pz_field in pz_mandatory_fields_list:
10279                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10280                    pz_param.get("pzprefix", "PTZ") + pz_field
10281                )
10282            else:
10283                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10284                pz_param_pzfields[pz_field] = pz_field_new
10285
10286                # Add PZ Transcript in header
10287                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10288                    pz_field_new,
10289                    ".",
10290                    "String",
10291                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10292                    "unknown",
10293                    "unknown",
10294                    code_type_map["String"],
10295                )
10296
10297        # PZ fields param
10298        pz_param["pzfields"] = pz_mandatory_fields
10299
10300        # Prioritization
10301        prioritization_result = self.prioritization(
10302            table=transcripts_table,
10303            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10304        )
10305        if not prioritization_result:
10306            log.warning("Transcripts prioritization not processed")
10307            return False
10308
10309        # PZ fields sql query
10310        query_update_select_list = []
10311        query_update_concat_list = []
10312        query_update_order_list = []
10313        for pz_param_pzfield in set(
10314            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10315        ):
10316            query_update_select_list.append(f" {pz_param_pzfield}, ")
10317
10318        for pz_param_pzfield in pz_param_pzfields:
10319            query_update_concat_list.append(
10320                f"""
10321                    , CASE 
10322                        WHEN {pz_param_pzfield} IS NOT NULL
10323                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10324                        ELSE ''
10325                    END
10326                """
10327            )
10328
10329        # Order by
10330        pz_orders = (
10331            param.get("transcripts", {})
10332            .get("prioritization", {})
10333            .get("prioritization_transcripts_order", {})
10334        )
10335        if not pz_orders:
10336            pz_orders = {
10337                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10338                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10339            }
10340        for pz_order in pz_orders:
10341            query_update_order_list.append(
10342                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10343            )
10344
10345        # Fields to explode
10346        fields_to_explode = (
10347            list(pz_param_pzfields.keys())
10348            + pz_mandatory_fields
10349            + list(pz_orders.keys())
10350        )
10351        # Remove transcript column as a specific transcript column
10352        if "transcript" in fields_to_explode:
10353            fields_to_explode.remove("transcript")
10354
10355        # Fields intranscripts table
10356        query_transcripts_table = f"""
10357            DESCRIBE SELECT * FROM {transcripts_table}
10358        """
10359        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10360
10361        # Check fields to explode
10362        for field_to_explode in fields_to_explode:
10363            if field_to_explode not in self.get_header_infos_list() + list(
10364                query_transcripts_table.column_name
10365            ):
10366                msg_err = f"INFO/{field_to_explode} NOT IN header"
10367                log.error(msg_err)
10368                raise ValueError(msg_err)
10369
10370        # Explode fields to explode
10371        self.explode_infos(
10372            table=transcripts_table,
10373            fields=fields_to_explode,
10374        )
10375
10376        # Transcript preference file
10377        transcripts_preference_file = (
10378            param.get("transcripts", {})
10379            .get("prioritization", {})
10380            .get("prioritization_transcripts", {})
10381        )
10382        transcripts_preference_file = full_path(transcripts_preference_file)
10383
10384        # Transcript preference forced
10385        transcript_preference_force = (
10386            param.get("transcripts", {})
10387            .get("prioritization", {})
10388            .get("prioritization_transcripts_force", False)
10389        )
10390        # Transcript version forced
10391        transcript_version_force = (
10392            param.get("transcripts", {})
10393            .get("prioritization", {})
10394            .get("prioritization_transcripts_version_force", False)
10395        )
10396
10397        # Transcripts Ranking
10398        if transcripts_preference_file:
10399
10400            # Transcripts file to dataframe
10401            if os.path.exists(transcripts_preference_file):
10402                transcripts_preference_dataframe = transcripts_file_to_df(
10403                    transcripts_preference_file
10404                )
10405            else:
10406                log.error(
10407                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10408                )
10409                raise ValueError(
10410                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10411                )
10412
10413            # Order by depending to transcript preference forcing
10414            if transcript_preference_force:
10415                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10416            else:
10417                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10418
10419            # Transcript columns joined depend on version consideration
10420            if transcript_version_force:
10421                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10422            else:
10423                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10424
10425            # Query ranking for update
10426            query_update_ranking = f"""
10427                SELECT
10428                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10429                    ROW_NUMBER() OVER (
10430                        PARTITION BY "#CHROM", POS, REF, ALT
10431                        ORDER BY {order_by}
10432                    ) AS rn
10433                FROM {transcripts_table}
10434                LEFT JOIN 
10435                    (
10436                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10437                        FROM transcripts_preference_dataframe
10438                    ) AS transcripts_preference
10439                ON {transcripts_version_join}
10440            """
10441
10442        else:
10443
10444            # Query ranking for update
10445            query_update_ranking = f"""
10446                SELECT
10447                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10448                    ROW_NUMBER() OVER (
10449                        PARTITION BY "#CHROM", POS, REF, ALT
10450                        ORDER BY {" , ".join(query_update_order_list)}
10451                    ) AS rn
10452                FROM {transcripts_table}
10453            """
10454
10455        # Export Transcripts prioritization infos to variants table
10456        query_update = f"""
10457            WITH RankedTranscripts AS (
10458                {query_update_ranking}
10459            )
10460            UPDATE {table_variants}
10461                SET
10462                INFO = CONCAT(CASE
10463                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10464                            THEN ''
10465                            ELSE concat("INFO", ';')
10466                        END,
10467                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10468                        )
10469            FROM
10470                RankedTranscripts
10471            WHERE
10472                rn = 1
10473                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10474                AND variants."POS" = RankedTranscripts."POS"
10475                AND variants."REF" = RankedTranscripts."REF"
10476                AND variants."ALT" = RankedTranscripts."ALT"     
10477        """
10478
10479        # log.debug(f"query_update={query_update}")
10480        self.execute_query(query=query_update)
10481
10482        # Return
10483        return True
10484
10485    def create_transcript_view_from_columns_map(
10486        self,
10487        transcripts_table: str = "transcripts",
10488        columns_maps: dict = {},
10489        added_columns: list = [],
10490        temporary_tables: list = None,
10491        annotation_fields: list = None,
10492        column_rename: dict = {},
10493        column_clean: bool = False,
10494        column_case: str = None,
10495    ) -> tuple[list, list, list]:
10496        """
10497        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10498        specified columns mapping for transcripts data.
10499
10500        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10501        of the table where the transcripts data is stored or will be stored in the database. This table
10502        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10503        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10504        :type transcripts_table: str (optional)
10505        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10506        about how to map columns from a transcripts table to create a view. Each entry in the
10507        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10508        typically includes details such as the main transcript column and additional information columns
10509        :type columns_maps: dict
10510        :param added_columns: The `added_columns` parameter in the
10511        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10512        that will be added to the view being created based on the columns map provided. These columns
10513        are generated by exploding the transcript information columns along with the main transcript
10514        column
10515        :type added_columns: list
10516        :param temporary_tables: The `temporary_tables` parameter in the
10517        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10518        tables created during the process of creating a transcript view from a columns map. These
10519        temporary tables are used to store intermediate results or transformations before the final view
10520        is generated
10521        :type temporary_tables: list
10522        :param annotation_fields: The `annotation_fields` parameter in the
10523        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10524        used for annotation in the query view creation process. These fields are extracted from the
10525        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10526        :type annotation_fields: list
10527        :param column_rename: The `column_rename` parameter in the
10528        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10529        custom renaming for columns during the creation of the temporary table view. This parameter
10530        provides a mapping of original column names to the desired renamed column names. By using this
10531        parameter,
10532        :type column_rename: dict
10533        :param column_clean: The `column_clean` parameter in the
10534        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10535        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10536        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10537        False
10538        :type column_clean: bool (optional)
10539        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10540        function is used to specify the case transformation to be applied to the columns during the view
10541        creation process. It allows you to control whether the column values should be converted to
10542        lowercase, uppercase, or remain unchanged
10543        :type column_case: str
10544        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10545        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10546        """
10547
10548        log.debug("Start transcrpts view creation from columns map...")
10549
10550        # "from_columns_map": [
10551        #     {
10552        #         "transcripts_column": "Ensembl_transcriptid",
10553        #         "transcripts_infos_columns": [
10554        #             "genename",
10555        #             "Ensembl_geneid",
10556        #             "LIST_S2_score",
10557        #             "LIST_S2_pred",
10558        #         ],
10559        #     },
10560        #     {
10561        #         "transcripts_column": "Ensembl_transcriptid",
10562        #         "transcripts_infos_columns": [
10563        #             "genename",
10564        #             "VARITY_R_score",
10565        #             "Aloft_pred",
10566        #         ],
10567        #     },
10568        # ],
10569
10570        # Init
10571        if temporary_tables is None:
10572            temporary_tables = []
10573        if annotation_fields is None:
10574            annotation_fields = []
10575
10576        # Variants table
10577        table_variants = self.get_table_variants()
10578
10579        for columns_map in columns_maps:
10580
10581            # Transcript column
10582            transcripts_column = columns_map.get("transcripts_column", None)
10583
10584            # Transcripts infos columns
10585            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10586
10587            # Transcripts infos columns rename
10588            column_rename = columns_map.get("column_rename", column_rename)
10589
10590            # Transcripts infos columns clean
10591            column_clean = columns_map.get("column_clean", column_clean)
10592
10593            # Transcripts infos columns case
10594            column_case = columns_map.get("column_case", column_case)
10595
10596            if transcripts_column is not None:
10597
10598                # Explode
10599                added_columns += self.explode_infos(
10600                    fields=[transcripts_column] + transcripts_infos_columns
10601                )
10602
10603                # View clauses
10604                clause_select_variants = []
10605                clause_select_tanscripts = []
10606                for field in [transcripts_column] + transcripts_infos_columns:
10607
10608                    # AS field
10609                    as_field = field
10610
10611                    # Rename
10612                    if column_rename:
10613                        as_field = column_rename.get(as_field, as_field)
10614
10615                    # Clean
10616                    if column_clean:
10617                        as_field = clean_annotation_field(as_field)
10618
10619                    # Case
10620                    if column_case:
10621                        if column_case.lower() in ["lower"]:
10622                            as_field = as_field.lower()
10623                        elif column_case.lower() in ["upper"]:
10624                            as_field = as_field.upper()
10625
10626                    # Clause select Variants
10627                    clause_select_variants.append(
10628                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10629                    )
10630
10631                    if field in [transcripts_column]:
10632                        clause_select_tanscripts.append(
10633                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10634                        )
10635                    else:
10636                        clause_select_tanscripts.append(
10637                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10638                        )
10639                        annotation_fields.append(as_field)
10640
10641                # Querey View
10642                query = f""" 
10643                    SELECT
10644                        "#CHROM", POS, REF, ALT, INFO,
10645                        "{transcripts_column}" AS 'transcript',
10646                        {", ".join(clause_select_tanscripts)}
10647                    FROM (
10648                        SELECT 
10649                            "#CHROM", POS, REF, ALT, INFO,
10650                            {", ".join(clause_select_variants)}
10651                        FROM {table_variants}
10652                        )
10653                    WHERE "{transcripts_column}" IS NOT NULL
10654                """
10655
10656                # Create temporary table
10657                temporary_table = transcripts_table + "".join(
10658                    random.choices(string.ascii_uppercase + string.digits, k=10)
10659                )
10660
10661                # Temporary_tables
10662                temporary_tables.append(temporary_table)
10663                query_view = f"""
10664                    CREATE TEMPORARY TABLE {temporary_table}
10665                    AS ({query})
10666                """
10667                self.execute_query(query=query_view)
10668
10669        return added_columns, temporary_tables, annotation_fields
10670
10671    def create_transcript_view_from_column_format(
10672        self,
10673        transcripts_table: str = "transcripts",
10674        column_formats: dict = {},
10675        temporary_tables: list = None,
10676        annotation_fields: list = None,
10677        column_rename: dict = {},
10678        column_clean: bool = False,
10679        column_case: str = None,
10680    ) -> tuple[list, list, list]:
10681        """
10682        The `create_transcript_view_from_column_format` function generates a transcript view based on
10683        specified column formats, adds additional columns and annotation fields, and returns the list of
10684        temporary tables and annotation fields.
10685
10686        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10687        of the table containing the transcripts data. This table will be used as the base table for
10688        creating the transcript view. The default value for this parameter is "transcripts", but you can
10689        provide a different table name if needed, defaults to transcripts
10690        :type transcripts_table: str (optional)
10691        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10692        about the columns to be used for creating the transcript view. Each entry in the dictionary
10693        specifies the mapping between a transcripts column and a transcripts infos column. This
10694        parameter allows you to define how the columns from the transcripts table should be transformed
10695        or mapped
10696        :type column_formats: dict
10697        :param temporary_tables: The `temporary_tables` parameter in the
10698        `create_transcript_view_from_column_format` function is a list that stores the names of
10699        temporary views created during the process of creating a transcript view from a column format.
10700        These temporary views are used to manipulate and extract data before generating the final
10701        transcript view
10702        :type temporary_tables: list
10703        :param annotation_fields: The `annotation_fields` parameter in the
10704        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10705        that are extracted from the temporary views created during the process. These annotation fields
10706        are obtained by querying the temporary views and extracting the column names excluding specific
10707        columns like `#CH
10708        :type annotation_fields: list
10709        :param column_rename: The `column_rename` parameter in the
10710        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10711        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10712        column names to new column names in this dictionary, you can rename specific columns during the
10713        process
10714        :type column_rename: dict
10715        :param column_clean: The `column_clean` parameter in the
10716        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10717        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10718        will be cleaned during the creation of the transcript view based on the specified column format,
10719        defaults to False
10720        :type column_clean: bool (optional)
10721        :param column_case: The `column_case` parameter in the
10722        `create_transcript_view_from_column_format` function is used to specify the case transformation
10723        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10724        to convert the column names to uppercase or lowercase, respectively
10725        :type column_case: str
10726        :return: The `create_transcript_view_from_column_format` function returns two lists:
10727        `temporary_tables` and `annotation_fields`.
10728        """
10729
10730        log.debug("Start transcrpts view creation from column format...")
10731
10732        #  "from_column_format": [
10733        #     {
10734        #         "transcripts_column": "ANN",
10735        #         "transcripts_infos_column": "Feature_ID",
10736        #     }
10737        # ],
10738
10739        # Init
10740        if temporary_tables is None:
10741            temporary_tables = []
10742        if annotation_fields is None:
10743            annotation_fields = []
10744
10745        for column_format in column_formats:
10746
10747            # annotation field and transcript annotation field
10748            annotation_field = column_format.get("transcripts_column", "ANN")
10749            transcript_annotation = column_format.get(
10750                "transcripts_infos_column", "Feature_ID"
10751            )
10752
10753            # Transcripts infos columns rename
10754            column_rename = column_format.get("column_rename", column_rename)
10755
10756            # Transcripts infos columns clean
10757            column_clean = column_format.get("column_clean", column_clean)
10758
10759            # Transcripts infos columns case
10760            column_case = column_format.get("column_case", column_case)
10761
10762            # Temporary View name
10763            temporary_view_name = transcripts_table + "".join(
10764                random.choices(string.ascii_uppercase + string.digits, k=10)
10765            )
10766
10767            # Create temporary view name
10768            temporary_view_name = self.annotation_format_to_table(
10769                uniquify=True,
10770                annotation_field=annotation_field,
10771                view_name=temporary_view_name,
10772                annotation_id=transcript_annotation,
10773                column_rename=column_rename,
10774                column_clean=column_clean,
10775                column_case=column_case,
10776            )
10777
10778            # Annotation fields
10779            if temporary_view_name:
10780                query_annotation_fields = f"""
10781                    SELECT *
10782                    FROM (
10783                        DESCRIBE SELECT *
10784                        FROM {temporary_view_name}
10785                        )
10786                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10787                """
10788                df_annotation_fields = self.get_query_to_df(
10789                    query=query_annotation_fields
10790                )
10791
10792                # Add temporary view and annotation fields
10793                temporary_tables.append(temporary_view_name)
10794                annotation_fields += list(set(df_annotation_fields["column_name"]))
10795
10796        return temporary_tables, annotation_fields
10797
10798    def create_transcript_view(
10799        self,
10800        transcripts_table: str = None,
10801        transcripts_table_drop: bool = False,
10802        param: dict = {},
10803    ) -> str:
10804        """
10805        The `create_transcript_view` function generates a transcript view by processing data from a
10806        specified table based on provided parameters and structural information.
10807
10808        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10809        is used to specify the name of the table that will store the final transcript view data. If a table
10810        name is not provided, the function will create a new table to store the transcript view data, and by
10811        default,, defaults to transcripts
10812        :type transcripts_table: str (optional)
10813        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10814        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10815        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10816        the function will drop the existing transcripts table if it exists, defaults to False
10817        :type transcripts_table_drop: bool (optional)
10818        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10819        contains information needed to create a transcript view. It includes details such as the structure
10820        of the transcripts, columns mapping, column formats, and other necessary information for generating
10821        the view. This parameter allows for flexibility and customization
10822        :type param: dict
10823        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10824        created or modified during the execution of the function.
10825        """
10826
10827        log.debug("Start transcripts view creation...")
10828
10829        # Default
10830        transcripts_table_default = "transcripts"
10831
10832        # Param
10833        if not param:
10834            param = self.get_param()
10835
10836        # Struct
10837        struct = param.get("transcripts", {}).get("struct", None)
10838
10839        # Transcript veresion
10840        transcript_id_remove_version = param.get("transcripts", {}).get(
10841            "transcript_id_remove_version", False
10842        )
10843
10844        # Transcripts mapping
10845        transcript_id_mapping_file = param.get("transcripts", {}).get(
10846            "transcript_id_mapping_file", None
10847        )
10848
10849        # Transcripts mapping
10850        transcript_id_mapping_force = param.get("transcripts", {}).get(
10851            "transcript_id_mapping_force", None
10852        )
10853
10854        if struct:
10855
10856            # Transcripts table
10857            if transcripts_table is None:
10858                transcripts_table = param.get("transcripts", {}).get(
10859                    "table", transcripts_table_default
10860                )
10861
10862            # added_columns
10863            added_columns = []
10864
10865            # Temporary tables
10866            temporary_tables = []
10867
10868            # Annotation fields
10869            annotation_fields = []
10870
10871            # from columns map
10872            columns_maps = struct.get("from_columns_map", [])
10873            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10874                self.create_transcript_view_from_columns_map(
10875                    transcripts_table=transcripts_table,
10876                    columns_maps=columns_maps,
10877                    added_columns=added_columns,
10878                    temporary_tables=temporary_tables,
10879                    annotation_fields=annotation_fields,
10880                )
10881            )
10882            added_columns += added_columns_tmp
10883            temporary_tables += temporary_tables_tmp
10884            annotation_fields += annotation_fields_tmp
10885
10886            # from column format
10887            column_formats = struct.get("from_column_format", [])
10888            temporary_tables_tmp, annotation_fields_tmp = (
10889                self.create_transcript_view_from_column_format(
10890                    transcripts_table=transcripts_table,
10891                    column_formats=column_formats,
10892                    temporary_tables=temporary_tables,
10893                    annotation_fields=annotation_fields,
10894                )
10895            )
10896            temporary_tables += temporary_tables_tmp
10897            annotation_fields += annotation_fields_tmp
10898
10899            # Remove some specific fields/column
10900            annotation_fields = list(set(annotation_fields))
10901            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10902                if field in annotation_fields:
10903                    annotation_fields.remove(field)
10904
10905            # Merge temporary tables query
10906            query_merge = ""
10907            for temporary_table in list(set(temporary_tables)):
10908
10909                # First temporary table
10910                if not query_merge:
10911                    query_merge = f"""
10912                        SELECT * FROM {temporary_table}
10913                    """
10914                # other temporary table (using UNION)
10915                else:
10916                    query_merge += f"""
10917                        UNION BY NAME SELECT * FROM {temporary_table}
10918                    """
10919
10920            # transcript table tmp
10921            transcript_table_tmp = "transcripts_tmp"
10922            transcript_table_tmp2 = "transcripts_tmp2"
10923            transcript_table_tmp3 = "transcripts_tmp3"
10924
10925            # Merge on transcript
10926            query_merge_on_transcripts_annotation_fields = []
10927
10928            # Add transcript list
10929            query_merge_on_transcripts_annotation_fields.append(
10930                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10931            )
10932
10933            # Aggregate all annotations fields
10934            for annotation_field in set(annotation_fields):
10935                query_merge_on_transcripts_annotation_fields.append(
10936                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10937                )
10938
10939            # Transcripts mapping
10940            if transcript_id_mapping_file:
10941
10942                # Transcript dataframe
10943                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10944                transcript_id_mapping_dataframe = transcripts_file_to_df(
10945                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10946                )
10947
10948                # Transcript version remove
10949                if transcript_id_remove_version:
10950                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10951                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10952                    query_left_join = f"""
10953                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10954                    """
10955                else:
10956                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10957                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10958                    query_left_join = f"""
10959                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10960                    """
10961
10962                # Transcript column for group by merge
10963                query_transcript_merge_group_by = """
10964                        CASE
10965                            WHEN transcript_mapped NOT IN ('')
10966                            THEN split_part(transcript_mapped, '.', 1)
10967                            ELSE split_part(transcript_original, '.', 1)
10968                        END
10969                    """
10970
10971                # Merge query
10972                transcripts_tmp2_query = f"""
10973                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10974                    FROM ({query_merge}) AS {transcript_table_tmp}
10975                    {query_left_join}
10976                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10977                """
10978
10979                # Retrive columns after mege
10980                transcripts_tmp2_describe_query = f"""
10981                    DESCRIBE {transcripts_tmp2_query}
10982                """
10983                transcripts_tmp2_describe_list = list(
10984                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10985                        "column_name"
10986                    ]
10987                )
10988
10989                # Create list of columns for select clause
10990                transcripts_tmp2_describe_select_clause = []
10991                for field in transcripts_tmp2_describe_list:
10992                    if field not in [
10993                        "#CHROM",
10994                        "POS",
10995                        "REF",
10996                        "ALT",
10997                        "INFO",
10998                        "transcript_mapped",
10999                    ]:
11000                        as_field = field
11001                        if field in ["transcript_original"]:
11002                            as_field = "transcripts_mapped"
11003                        transcripts_tmp2_describe_select_clause.append(
11004                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11005                        )
11006
11007                # Merge with mapping
11008                query_merge_on_transcripts = f"""
11009                    SELECT
11010                        "#CHROM", POS, REF, ALT, INFO,
11011                        CASE
11012                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11013                            THEN ANY_VALUE(transcript_mapped)
11014                            ELSE ANY_VALUE(transcript_original)
11015                        END AS transcript,
11016                        {", ".join(transcripts_tmp2_describe_select_clause)}
11017                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11018                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11019                        {query_transcript_merge_group_by}
11020                """
11021
11022                # Add transcript filter from mapping file
11023                if transcript_id_mapping_force:
11024                    query_merge_on_transcripts = f"""
11025                        SELECT *
11026                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11027                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11028                    """
11029
11030            # No transcript mapping
11031            else:
11032
11033                # Remove transcript version
11034                if transcript_id_remove_version:
11035                    query_transcript_column = f"""
11036                        split_part({transcript_table_tmp}.transcript, '.', 1)
11037                    """
11038                else:
11039                    query_transcript_column = """
11040                        transcript
11041                    """
11042
11043                # Query sections
11044                query_transcript_column_select = (
11045                    f"{query_transcript_column} AS transcript"
11046                )
11047                query_transcript_column_group_by = query_transcript_column
11048
11049                # Query for transcripts view
11050                query_merge_on_transcripts = f"""
11051                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11052                    FROM ({query_merge}) AS {transcript_table_tmp}
11053                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11054                """
11055
11056            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
11057
11058            # Drop transcript view is necessary
11059            if transcripts_table_drop:
11060                query_drop = f"""
11061                    DROP TABLE IF EXISTS {transcripts_table};
11062                """
11063                self.execute_query(query=query_drop)
11064
11065            # Merge and create transcript view
11066            query_create_view = f"""
11067                CREATE TABLE IF NOT EXISTS {transcripts_table}
11068                AS {query_merge_on_transcripts}
11069            """
11070            self.execute_query(query=query_create_view)
11071
11072            # Remove added columns
11073            for added_column in added_columns:
11074                self.drop_column(column=added_column)
11075
11076        else:
11077
11078            transcripts_table = None
11079
11080        return transcripts_table
11081
11082    def annotation_format_to_table(
11083        self,
11084        uniquify: bool = True,
11085        annotation_field: str = "ANN",
11086        annotation_id: str = "Feature_ID",
11087        view_name: str = "transcripts",
11088        column_rename: dict = {},
11089        column_clean: bool = False,
11090        column_case: str = None,
11091    ) -> str:
11092        """
11093        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11094        structured table format, ensuring unique values and creating a temporary table for further
11095        processing or analysis.
11096
11097        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11098        unique values in the output or not. If set to `True`, the function will make sure that the
11099        output values are unique, defaults to True
11100        :type uniquify: bool (optional)
11101        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11102        that contains the annotation information for each variant. This field is used to extract the
11103        annotation details for further processing in the function. By default, it is set to "ANN",
11104        defaults to ANN
11105        :type annotation_field: str (optional)
11106        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11107        is used to specify the identifier for the annotation feature. This identifier will be used as a
11108        column name in the resulting table or view that is created based on the annotation data. It
11109        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11110        :type annotation_id: str (optional)
11111        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11112        to specify the name of the temporary table that will be created to store the transformed
11113        annotation data. This table will hold the extracted information from the annotation field in a
11114        structured format for further processing or analysis. By default,, defaults to transcripts
11115        :type view_name: str (optional)
11116        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11117        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11118        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11119        created based on the annotation data. This feature enables
11120        :type column_rename: dict
11121        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11122        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11123        If set to `True`, the function will clean the annotation field before further processing. This
11124        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11125        to False
11126        :type column_clean: bool (optional)
11127        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11128        used to specify the case transformation to be applied to the column names extracted from the
11129        annotation data. It allows you to set the case of the column names to either lowercase or
11130        uppercase for consistency or other specific requirements during the conversion
11131        :type column_case: str
11132        :return: The function `annotation_format_to_table` is returning the name of the view created,
11133        which is stored in the variable `view_name`.
11134        """
11135
11136        # Annotation field
11137        annotation_format = "annotation_explode"
11138
11139        # Transcript annotation
11140        if column_rename:
11141            annotation_id = column_rename.get(annotation_id, annotation_id)
11142
11143        if column_clean:
11144            annotation_id = clean_annotation_field(annotation_id)
11145
11146        # Prefix
11147        prefix = self.get_explode_infos_prefix()
11148        if prefix:
11149            prefix = "INFO/"
11150
11151        # Annotation fields
11152        annotation_infos = prefix + annotation_field
11153        annotation_format_infos = prefix + annotation_format
11154
11155        # Variants table
11156        table_variants = self.get_table_variants()
11157
11158        # Header
11159        vcf_reader = self.get_header()
11160
11161        # Add columns
11162        added_columns = []
11163
11164        # Explode HGVS field in column
11165        added_columns += self.explode_infos(fields=[annotation_field])
11166
11167        if annotation_field in vcf_reader.infos:
11168
11169            # Extract ANN header
11170            ann_description = vcf_reader.infos[annotation_field].desc
11171            pattern = r"'(.+?)'"
11172            match = re.search(pattern, ann_description)
11173            if match:
11174                ann_header_match = match.group(1).split(" | ")
11175                ann_header = []
11176                ann_header_desc = {}
11177                for i in range(len(ann_header_match)):
11178                    ann_header_info = "".join(
11179                        char for char in ann_header_match[i] if char.isalnum()
11180                    )
11181                    ann_header.append(ann_header_info)
11182                    ann_header_desc[ann_header_info] = ann_header_match[i]
11183                if not ann_header_desc:
11184                    raise ValueError("Invalid header description format")
11185            else:
11186                raise ValueError("Invalid header description format")
11187
11188            # Create variant id
11189            variant_id_column = self.get_variant_id_column()
11190            added_columns += [variant_id_column]
11191
11192            # Create dataframe
11193            dataframe_annotation_format = self.get_query_to_df(
11194                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
11195            )
11196
11197            # Create annotation columns
11198            dataframe_annotation_format[
11199                annotation_format_infos
11200            ] = dataframe_annotation_format[annotation_infos].apply(
11201                lambda x: explode_annotation_format(
11202                    annotation=str(x),
11203                    uniquify=uniquify,
11204                    output_format="JSON",
11205                    prefix="",
11206                    header=list(ann_header_desc.values()),
11207                )
11208            )
11209
11210            # Find keys
11211            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
11212            df_keys = self.get_query_to_df(query=query_json)
11213
11214            # Check keys
11215            query_json_key = []
11216            for _, row in df_keys.iterrows():
11217
11218                # Key
11219                key = row.iloc[0]
11220                key_clean = key
11221
11222                # key rename
11223                if column_rename:
11224                    key_clean = column_rename.get(key_clean, key_clean)
11225
11226                # key clean
11227                if column_clean:
11228                    key_clean = clean_annotation_field(key_clean)
11229
11230                # Key case
11231                if column_case:
11232                    if column_case.lower() in ["lower"]:
11233                        key_clean = key_clean.lower()
11234                    elif column_case.lower() in ["upper"]:
11235                        key_clean = key_clean.upper()
11236
11237                # Type
11238                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
11239
11240                # Get DataFrame from query
11241                df_json_type = self.get_query_to_df(query=query_json_type)
11242
11243                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
11244                with pd.option_context("future.no_silent_downcasting", True):
11245                    df_json_type.fillna(value="", inplace=True)
11246                    replace_dict = {None: np.nan, "": np.nan}
11247                    df_json_type.replace(replace_dict, inplace=True)
11248                    df_json_type.dropna(inplace=True)
11249
11250                # Detect column type
11251                column_type = detect_column_type(df_json_type[key_clean])
11252
11253                # Append
11254                query_json_key.append(
11255                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11256                )
11257
11258            # Create view
11259            query_view = f"""
11260                CREATE TEMPORARY TABLE {view_name}
11261                AS (
11262                    SELECT *, {annotation_id} AS 'transcript'
11263                    FROM (
11264                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11265                        FROM dataframe_annotation_format
11266                        )
11267                    );
11268            """
11269            self.execute_query(query=query_view)
11270
11271        else:
11272
11273            # Return None
11274            view_name = None
11275
11276        # Remove added columns
11277        for added_column in added_columns:
11278            self.drop_column(column=added_column)
11279
11280        return view_name
11281
11282    def transcript_view_to_variants(
11283        self,
11284        transcripts_table: str = None,
11285        transcripts_column_id: str = None,
11286        transcripts_info_json: str = None,
11287        transcripts_info_field_json: str = None,
11288        transcripts_info_format: str = None,
11289        transcripts_info_field_format: str = None,
11290        param: dict = {},
11291    ) -> bool:
11292        """
11293        The `transcript_view_to_variants` function updates a variants table with information from
11294        transcripts in JSON format.
11295
11296        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11297        table containing the transcripts data. If this parameter is not provided, the function will
11298        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11299        :type transcripts_table: str
11300        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11301        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11302        identifier is used to match transcripts with variants in the database
11303        :type transcripts_column_id: str
11304        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11305        of the column in the variants table where the transcripts information will be stored in JSON
11306        format. This parameter allows you to define the column in the variants table that will hold the
11307        JSON-formatted information about transcripts
11308        :type transcripts_info_json: str
11309        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11310        specify the field in the VCF header that will contain information about transcripts in JSON
11311        format. This field will be added to the VCF header as an INFO field with the specified name
11312        :type transcripts_info_field_json: str
11313        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11314        format of the information about transcripts that will be stored in the variants table. This
11315        format can be used to define how the transcript information will be structured or displayed
11316        within the variants table
11317        :type transcripts_info_format: str
11318        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11319        specify the field in the VCF header that will contain information about transcripts in a
11320        specific format. This field will be added to the VCF header as an INFO field with the specified
11321        name
11322        :type transcripts_info_field_format: str
11323        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11324        that contains various configuration settings related to transcripts. It is used to provide
11325        default values for certain parameters if they are not explicitly provided when calling the
11326        method. The `param` dictionary can be passed as an argument
11327        :type param: dict
11328        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11329        if the operation is successful and `False` if certain conditions are not met.
11330        """
11331
11332        msg_info_prefix = "Start transcripts view to variants annotations"
11333
11334        log.debug(f"{msg_info_prefix}...")
11335
11336        # Default
11337        transcripts_table_default = "transcripts"
11338        transcripts_column_id_default = "transcript"
11339        transcripts_info_json_default = None
11340        transcripts_info_format_default = None
11341        transcripts_info_field_json_default = None
11342        transcripts_info_field_format_default = None
11343
11344        # Param
11345        if not param:
11346            param = self.get_param()
11347
11348        # Transcripts table
11349        if transcripts_table is None:
11350            transcripts_table = param.get("transcripts", {}).get(
11351                "table", transcripts_table_default
11352            )
11353
11354        # Transcripts column ID
11355        if transcripts_column_id is None:
11356            transcripts_column_id = param.get("transcripts", {}).get(
11357                "column_id", transcripts_column_id_default
11358            )
11359
11360        # Transcripts info json
11361        if transcripts_info_json is None:
11362            transcripts_info_json = param.get("transcripts", {}).get(
11363                "transcripts_info_json", transcripts_info_json_default
11364            )
11365
11366        # Transcripts info field JSON
11367        if transcripts_info_field_json is None:
11368            transcripts_info_field_json = param.get("transcripts", {}).get(
11369                "transcripts_info_field_json", transcripts_info_field_json_default
11370            )
11371        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11372        #     transcripts_info_json = transcripts_info_field_json
11373
11374        # Transcripts info format
11375        if transcripts_info_format is None:
11376            transcripts_info_format = param.get("transcripts", {}).get(
11377                "transcripts_info_format", transcripts_info_format_default
11378            )
11379
11380        # Transcripts info field FORMAT
11381        if transcripts_info_field_format is None:
11382            transcripts_info_field_format = param.get("transcripts", {}).get(
11383                "transcripts_info_field_format", transcripts_info_field_format_default
11384            )
11385        # if (
11386        #     transcripts_info_field_format is not None
11387        #     and transcripts_info_format is None
11388        # ):
11389        #     transcripts_info_format = transcripts_info_field_format
11390
11391        # Variants table
11392        table_variants = self.get_table_variants()
11393
11394        # Check info columns param
11395        if (
11396            transcripts_info_json is None
11397            and transcripts_info_field_json is None
11398            and transcripts_info_format is None
11399            and transcripts_info_field_format is None
11400        ):
11401            return False
11402
11403        # Transcripts infos columns
11404        query_transcripts_infos_columns = f"""
11405            SELECT *
11406            FROM (
11407                DESCRIBE SELECT * FROM {transcripts_table}
11408                )
11409            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11410        """
11411        transcripts_infos_columns = list(
11412            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11413        )
11414
11415        # View results
11416        clause_select = []
11417        clause_to_json = []
11418        clause_to_format = []
11419        for field in transcripts_infos_columns:
11420            # Do not consider INFO field for export into fields
11421            if field not in ["INFO"]:
11422                clause_select.append(
11423                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11424                )
11425                clause_to_json.append(f""" '{field}': "{field}" """)
11426                clause_to_format.append(f""" "{field}" """)
11427
11428        # Update
11429        update_set_json = []
11430        update_set_format = []
11431
11432        # VCF header
11433        vcf_reader = self.get_header()
11434
11435        # Transcripts to info column in JSON
11436        if transcripts_info_json:
11437
11438            # Create column on variants table
11439            self.add_column(
11440                table_name=table_variants,
11441                column_name=transcripts_info_json,
11442                column_type="JSON",
11443                default_value=None,
11444                drop=False,
11445            )
11446
11447            # Add header
11448            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11449                transcripts_info_json,
11450                ".",
11451                "String",
11452                "Transcripts in JSON format",
11453                "unknwon",
11454                "unknwon",
11455                self.code_type_map["String"],
11456            )
11457
11458            # Add to update
11459            update_set_json.append(
11460                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11461            )
11462
11463        # Transcripts to info field in JSON
11464        if transcripts_info_field_json:
11465
11466            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11467
11468            # Add to update
11469            update_set_json.append(
11470                f""" 
11471                    INFO = concat(
11472                            CASE
11473                                WHEN INFO NOT IN ('', '.')
11474                                THEN INFO
11475                                ELSE ''
11476                            END,
11477                            CASE
11478                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11479                                THEN concat(
11480                                    ';{transcripts_info_field_json}=',
11481                                    t.{transcripts_info_json}
11482                                )
11483                                ELSE ''
11484                            END
11485                            )
11486                """
11487            )
11488
11489            # Add header
11490            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11491                transcripts_info_field_json,
11492                ".",
11493                "String",
11494                "Transcripts in JSON format",
11495                "unknwon",
11496                "unknwon",
11497                self.code_type_map["String"],
11498            )
11499
11500        if update_set_json:
11501
11502            # Update query
11503            query_update = f"""
11504                UPDATE {table_variants}
11505                    SET {", ".join(update_set_json)}
11506                FROM
11507                (
11508                    SELECT
11509                        "#CHROM", POS, REF, ALT,
11510                            concat(
11511                            '{{',
11512                            string_agg(
11513                                '"' || "{transcripts_column_id}" || '":' ||
11514                                to_json(json_output)
11515                            ),
11516                            '}}'
11517                            )::JSON AS {transcripts_info_json}
11518                    FROM
11519                        (
11520                        SELECT
11521                            "#CHROM", POS, REF, ALT,
11522                            "{transcripts_column_id}",
11523                            to_json(
11524                                {{{",".join(clause_to_json)}}}
11525                            )::JSON AS json_output
11526                        FROM
11527                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11528                        WHERE "{transcripts_column_id}" IS NOT NULL
11529                        )
11530                    GROUP BY "#CHROM", POS, REF, ALT
11531                ) AS t
11532                WHERE {table_variants}."#CHROM" = t."#CHROM"
11533                    AND {table_variants}."POS" = t."POS"
11534                    AND {table_variants}."REF" = t."REF"
11535                    AND {table_variants}."ALT" = t."ALT"
11536            """
11537
11538            self.execute_query(query=query_update)
11539
11540        # Transcripts to info column in FORMAT
11541        if transcripts_info_format:
11542
11543            # Create column on variants table
11544            self.add_column(
11545                table_name=table_variants,
11546                column_name=transcripts_info_format,
11547                column_type="VARCHAR",
11548                default_value=None,
11549                drop=False,
11550            )
11551
11552            # Add header
11553            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11554                transcripts_info_format,
11555                ".",
11556                "String",
11557                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11558                "unknwon",
11559                "unknwon",
11560                self.code_type_map["String"],
11561            )
11562
11563            # Add to update
11564            update_set_format.append(
11565                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11566            )
11567
11568        else:
11569
11570            # Set variable for internal queries
11571            transcripts_info_format = "transcripts_info_format"
11572
11573        # Transcripts to info field in JSON
11574        if transcripts_info_field_format:
11575
11576            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11577
11578            # Add to update
11579            update_set_format.append(
11580                f""" 
11581                    INFO = concat(
11582                            CASE
11583                                WHEN INFO NOT IN ('', '.')
11584                                THEN INFO
11585                                ELSE ''
11586                            END,
11587                            CASE
11588                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11589                                THEN concat(
11590                                    ';{transcripts_info_field_format}=',
11591                                    t.{transcripts_info_format}
11592                                )
11593                                ELSE ''
11594                            END
11595                            )
11596                """
11597            )
11598
11599            # Add header
11600            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11601                transcripts_info_field_format,
11602                ".",
11603                "String",
11604                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11605                "unknwon",
11606                "unknwon",
11607                self.code_type_map["String"],
11608            )
11609
11610        if update_set_format:
11611
11612            # Update query
11613            query_update = f"""
11614                UPDATE {table_variants}
11615                    SET {", ".join(update_set_format)}
11616                FROM
11617                (
11618                    SELECT
11619                        "#CHROM", POS, REF, ALT,
11620                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11621                    FROM 
11622                        (
11623                        SELECT
11624                            "#CHROM", POS, REF, ALT,
11625                            "{transcripts_column_id}",
11626                            concat(
11627                                "{transcripts_column_id}",
11628                                '|',
11629                                {", '|', ".join(clause_to_format)}
11630                            ) AS {transcripts_info_format}
11631                        FROM
11632                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11633                        )
11634                    GROUP BY "#CHROM", POS, REF, ALT
11635                ) AS t
11636                WHERE {table_variants}."#CHROM" = t."#CHROM"
11637                    AND {table_variants}."POS" = t."POS"
11638                    AND {table_variants}."REF" = t."REF"
11639                    AND {table_variants}."ALT" = t."ALT"
11640            """
11641
11642            self.execute_query(query=query_update)
11643
11644        return True
11645
11646    def rename_info_fields(
11647        self, fields_to_rename: dict = None, table: str = None
11648    ) -> dict:
11649        """
11650        The `rename_info_fields` function renames specified fields in a VCF file header and updates
11651        corresponding INFO fields in the variants table.
11652
11653        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
11654        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
11655        represent the original field names that need to be renamed, and the corresponding values
11656        represent the new names to which the fields should be
11657        :type fields_to_rename: dict
11658        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
11659        the table in which the variants data is stored. This table contains information about genetic
11660        variants, and the function updates the corresponding INFO fields in this table when renaming
11661        specified fields in the VCF file header
11662        :type table: str
11663        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
11664        the original field names as keys and their corresponding new names (or None if the field was
11665        removed) as values after renaming or removing specified fields in a VCF file header and updating
11666        corresponding INFO fields in the variants table.
11667        """
11668
11669        # Init
11670        fields_renamed = {}
11671        config = self.get_config()
11672        access = config.get("access")
11673
11674        if table is None:
11675            table = self.get_table_variants()
11676
11677        if fields_to_rename is not None and access not in ["RO"]:
11678
11679            log.info("Rename or remove fields...")
11680
11681            # Header
11682            header = self.get_header()
11683
11684            for field_to_rename, field_renamed in fields_to_rename.items():
11685
11686                if field_to_rename in header.infos:
11687
11688                    # Rename header
11689                    if field_renamed is not None:
11690                        header.infos[field_renamed] = vcf.parser._Info(
11691                            field_renamed,
11692                            header.infos[field_to_rename].num,
11693                            header.infos[field_to_rename].type,
11694                            header.infos[field_to_rename].desc,
11695                            header.infos[field_to_rename].source,
11696                            header.infos[field_to_rename].version,
11697                            header.infos[field_to_rename].type_code,
11698                        )
11699                    del header.infos[field_to_rename]
11700
11701                    # Rename INFO patterns
11702                    field_pattern = rf'(^|;)({field_to_rename})=([^;]*)'
11703                    if field_renamed is not None:
11704                        field_renamed_pattern = rf'\1{field_renamed}=\3'
11705                    else:
11706                        field_renamed_pattern = ''
11707
11708                    # Rename INFO
11709                    query = f"""
11710                        UPDATE {table}
11711                        SET
11712                            INFO = regexp_replace(INFO, '{field_pattern}', '{field_renamed_pattern}', 'g')
11713                    """
11714                    self.execute_query(query=query)
11715
11716                    # Return
11717                    fields_renamed[field_to_rename] = field_renamed
11718
11719                    # Log
11720                    if field_renamed is not None:
11721                        log.info(f"Rename or remove fields: field '{field_to_rename}' renamed to '{field_renamed}'")
11722                    else:
11723                        log.info(f"Rename or remove fields: field '{field_to_rename}' removed")
11724
11725        return fields_renamed
11726
11727    def calculation_rename_info_fields(
11728        self,
11729        fields_to_rename: dict = None,
11730        table: str = None,
11731        operation_name: str = "RENAME_INFO_FIELDS",
11732    ) -> None:
11733        """
11734        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11735        fields to rename and table if provided, and then calls another function to rename the fields.
11736
11737        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11738        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11739        the key and the new field name as the value
11740        :type fields_to_rename: dict
11741        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11742        specify the name of the table for which the fields are to be renamed. It is a string type
11743        parameter
11744        :type table: str
11745        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11746        method is a string that specifies the name of the operation being performed. In this context, it
11747        is used as a default value for the operation name if not explicitly provided when calling the
11748        function, defaults to RENAME_INFO_FIELDS
11749        :type operation_name: str (optional)
11750        """
11751
11752        # Param
11753        param = self.get_param()
11754
11755        # Get param fields to rename
11756        param_fields_to_rename = (
11757            param.get("calculation", {})
11758            .get("calculations", {})
11759            .get(operation_name, {})
11760            .get("fields_to_rename", None)
11761        )
11762
11763        # Get param table
11764        param_table = (
11765            param.get("calculation", {})
11766            .get("calculations", {})
11767            .get(operation_name, {})
11768            .get("table", None)
11769        )
11770
11771        # Init fields_to_rename
11772        if fields_to_rename is None:
11773            fields_to_rename = param_fields_to_rename
11774
11775        # Init table
11776        if table is None:
11777            table = param_table
11778
11779        renamed_fields = self.rename_info_fields(
11780            fields_to_rename=fields_to_rename, table=table
11781        )
11782
11783        log.debug(f"renamed_fields:{renamed_fields}")
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
38    def __init__(
39        self,
40        conn=None,
41        input: str = None,
42        output: str = None,
43        config: dict = {},
44        param: dict = {},
45        load: bool = False,
46    ) -> None:
47        """
48        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
49        header
50
51        :param conn: the connection to the database
52        :param input: the input file
53        :param output: the output file
54        :param config: a dictionary containing the configuration of the model
55        :param param: a dictionary containing the parameters of the model
56        """
57
58        # Init variables
59        self.init_variables()
60
61        # Input
62        self.set_input(input)
63
64        # Config
65        self.set_config(config)
66
67        # Param
68        self.set_param(param)
69
70        # Output
71        self.set_output(output)
72
73        # connexion
74        self.set_connexion(conn)
75
76        # Header
77        self.set_header()
78
79        # Samples
80        self.set_samples()
81
82        # Load data
83        if load:
84            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connexion and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_samples(self, samples: list = None) -> list:
 86    def set_samples(self, samples: list = None) -> list:
 87        """
 88        The function `set_samples` sets the samples attribute of an object to a provided list or
 89        retrieves it from a parameter dictionary.
 90
 91        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
 92        input and sets the `samples` attribute of the class to the provided list. If no samples are
 93        provided, it tries to get the samples from the class's parameters using the `get_param` method
 94        :type samples: list
 95        :return: The `samples` list is being returned.
 96        """
 97
 98        if not samples:
 99            samples = self.get_param().get("samples", {}).get("list", None)
100
101        self.samples = samples
102
103        return samples

The function set_samples sets the samples attribute of an object to a provided list or retrieves it from a parameter dictionary.

Parameters
  • samples: The set_samples method is a method of a class that takes a list of samples as input and sets the samples attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the get_param method
Returns

The samples list is being returned.

def get_samples(self) -> list:
105    def get_samples(self) -> list:
106        """
107        This function returns a list of samples.
108        :return: The `get_samples` method is returning the `samples` attribute of the object.
109        """
110
111        return self.samples

This function returns a list of samples.

Returns

The get_samples method is returning the samples attribute of the object.

def get_samples_check(self) -> bool:
113    def get_samples_check(self) -> bool:
114        """
115        This function returns the value of the "check" key within the "samples" dictionary retrieved
116        from the parameters.
117        :return: The method `get_samples_check` is returning the value of the key "check" inside the
118        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
 119        method. If the key "check" is not found, it will return `True`.
120        """
121
122        return self.get_param().get("samples", {}).get("check", True)

This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.

Returns

The method get_samples_check is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the get_param() method. If the key "check" is not found, it will return True.

def set_input(self, input: str = None) -> None:
124    def set_input(self, input: str = None) -> None:
125        """
126        The function `set_input` takes a file name as input, extracts the name and extension, and sets
127        attributes in the class accordingly.
128
129        :param input: The `set_input` method in the provided code snippet is used to set attributes
130        related to the input file. Here's a breakdown of the parameters and their usage in the method:
131        :type input: str
132        """
133
134        if input and not isinstance(input, str):
135            try:
136                self.input = input.name
137            except:
138                log.error(f"Input file '{input} in bad format")
139                raise ValueError(f"Input file '{input} in bad format")
140        else:
141            self.input = input
142
143        # Input format
144        if input:
145            input_name, input_extension = os.path.splitext(self.input)
146            self.input_name = input_name
147            self.input_extension = input_extension
148            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: the input file, given either as a path string or as a file-like object whose name attribute is used; the input name, extension, and format attributes of the class are then derived from the file name.
def set_config(self, config: dict) -> None:
150    def set_config(self, config: dict) -> None:
151        """
152        The set_config function takes a config object and assigns it as the configuration object for the
153        class.
154
155        :param config: The `config` parameter in the `set_config` function is a dictionary object that
156        contains configuration settings for the class. When you call the `set_config` function with a
157        dictionary object as the argument, it will set that dictionary as the configuration object for
158        the class
159        :type config: dict
160        """
161
162        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
164    def set_param(self, param: dict) -> None:
165        """
166        This function sets a parameter object for the class based on the input dictionary.
167
168        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
169        as the `param` attribute of the class instance
170        :type param: dict
171        """
172
173        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
175    def init_variables(self) -> None:
176        """
177        This function initializes the variables that will be used in the rest of the class
178        """
179
180        self.prefix = "howard"
181        self.table_variants = "variants"
182        self.dataframe = None
183
184        self.comparison_map = {
185            "gt": ">",
186            "gte": ">=",
187            "lt": "<",
188            "lte": "<=",
189            "equals": "=",
190            "contains": "SIMILAR TO",
191        }
192
193        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
194
195        self.code_type_map_to_sql = {
196            "Integer": "INTEGER",
197            "String": "VARCHAR",
198            "Float": "FLOAT",
199            "Flag": "VARCHAR",
200        }
201
202        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
204    def get_indexing(self) -> bool:
205        """
206        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
207        returns False.
208        :return: The value of the indexing parameter.
209        """
210
211        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
213    def get_connexion_config(self) -> dict:
214        """
215        The function `get_connexion_config` returns a dictionary containing the configuration for a
216        connection, including the number of threads and memory limit.
217        :return: a dictionary containing the configuration for the Connexion library.
218        """
219
220        # config
221        config = self.get_config()
222
223        # Connexion config
224        connexion_config = {}
225        threads = self.get_threads()
226
227        # Threads
228        if threads:
229            connexion_config["threads"] = threads
230
231        # Memory
232        # if config.get("memory", None):
233        #     connexion_config["memory_limit"] = config.get("memory")
234        if self.get_memory():
235            connexion_config["memory_limit"] = self.get_memory()
236
237        # Temporary directory
238        if config.get("tmp", None):
239            connexion_config["temp_directory"] = config.get("tmp")
240
241        # Access
242        if config.get("access", None):
243            access = config.get("access")
244            if access in ["RO"]:
245                access = "READ_ONLY"
246            elif access in ["RW"]:
247                access = "READ_WRITE"
248            connexion_db = self.get_connexion_db()
249            if connexion_db in ":memory:":
250                access = "READ_WRITE"
251            connexion_config["access_mode"] = access
252
253        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the configuration for the Connexion library.

def get_duckdb_settings(self) -> dict:
255    def get_duckdb_settings(self) -> dict:
256        """
257        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
258        string.
259        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
260        """
261
262        # config
263        config = self.get_config()
264
265        # duckdb settings
266        duckdb_settings_dict = {}
267        if config.get("duckdb_settings", None):
268            duckdb_settings = config.get("duckdb_settings")
269            duckdb_settings = full_path(duckdb_settings)
270            # duckdb setting is a file
271            if os.path.exists(duckdb_settings):
272                with open(duckdb_settings) as json_file:
273                    duckdb_settings_dict = yaml.safe_load(json_file)
274            # duckdb settings is a string
275            else:
276                duckdb_settings_dict = json.loads(duckdb_settings)
277
278        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
280    def set_connexion_db(self) -> str:
281        """
282        The function `set_connexion_db` returns the appropriate database connection string based on the
283        input format and connection type.
284        :return: the value of the variable `connexion_db`.
285        """
286
287        # Default connexion db
288        default_connexion_db = ":memory:"
289
290        # Find connexion db
291        if self.get_input_format() in ["db", "duckdb"]:
292            connexion_db = self.get_input()
293        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
294            connexion_db = default_connexion_db
295        elif self.get_connexion_type() in ["tmpfile"]:
296            tmp_name = tempfile.mkdtemp(
297                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
298            )
299            connexion_db = f"{tmp_name}/tmp.db"
300        elif self.get_connexion_type() != "":
301            connexion_db = self.get_connexion_type()
302        else:
303            connexion_db = default_connexion_db
304
305        # Set connexion db
306        self.connexion_db = connexion_db
307
308        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
310    def set_connexion(self, conn) -> None:
311        """
312        The function `set_connexion` creates a connection to a database, with options for different
313        database formats and settings.
314
315        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
316        database. If a connection is not provided, a new connection to an in-memory database is created.
317        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
 318        sqlite)
319        """
320
321        # Connexion db
322        connexion_db = self.set_connexion_db()
323
324        # Connexion config
325        connexion_config = self.get_connexion_config()
326
327        # Connexion format
328        connexion_format = self.get_config().get("connexion_format", "duckdb")
329        # Set connexion format
330        self.connexion_format = connexion_format
331
332        # Connexion
333        if not conn:
334            if connexion_format in ["duckdb"]:
335                conn = duckdb.connect(connexion_db, config=connexion_config)
336                # duckDB settings
337                duckdb_settings = self.get_duckdb_settings()
338                if duckdb_settings:
339                    for setting in duckdb_settings:
340                        setting_value = duckdb_settings.get(setting)
341                        if isinstance(setting_value, str):
342                            setting_value = f"'{setting_value}'"
343                        conn.execute(f"PRAGMA {setting}={setting_value};")
344            elif connexion_format in ["sqlite"]:
345                conn = sqlite3.connect(connexion_db)
346
347        # Set connexion
348        self.conn = conn
349
350        # Log
351        log.debug(f"connexion_format: {connexion_format}")
352        log.debug(f"connexion_db: {connexion_db}")
353        log.debug(f"connexion config: {connexion_config}")
354        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: The conn parameter in the set_connexion method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
356    def set_output(self, output: str = None) -> None:
357        """
358        The `set_output` function in Python sets the output file based on the input or a specified key
359        in the config file, extracting the output name, extension, and format.
360
361        :param output: The `output` parameter in the `set_output` method is used to specify the name of
362        the output file. If the config file has an 'output' key, the method sets the output to the value
363        of that key. If no output is provided, it sets the output to `None`
364        :type output: str
365        """
366
367        if output and not isinstance(output, str):
368            self.output = output.name
369        else:
370            self.output = output
371
372        # Output format
373        if self.output:
374            output_name, output_extension = os.path.splitext(self.output)
375            self.output_name = output_name
376            self.output_extension = output_extension
377            self.output_format = self.output_extension.replace(".", "")
378        else:
379            self.output_name = None
380            self.output_extension = None
381            self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
383    def set_header(self) -> None:
384        """
385        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
386        """
387
388        input_file = self.get_input()
389        default_header_list = [
390            "##fileformat=VCFv4.2",
391            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
392        ]
393
394        # Full path
395        input_file = full_path(input_file)
396
397        if input_file:
398
399            input_format = self.get_input_format()
400            input_compressed = self.get_input_compressed()
401            config = self.get_config()
402            header_list = default_header_list
403            if input_format in [
404                "vcf",
405                "hdr",
406                "tsv",
407                "csv",
408                "psv",
409                "parquet",
410                "db",
411                "duckdb",
412            ]:
413                # header provided in param
414                if config.get("header_file", None):
415                    with open(config.get("header_file"), "rt") as f:
416                        header_list = self.read_vcf_header(f)
417                # within a vcf file format (header within input file itsself)
418                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
419                    # within a compressed vcf file format (.vcf.gz)
420                    if input_compressed:
421                        with bgzf.open(input_file, "rt") as f:
422                            header_list = self.read_vcf_header(f)
423                    # within an uncompressed vcf file format (.vcf)
424                    else:
425                        with open(input_file, "rt") as f:
426                            header_list = self.read_vcf_header(f)
427                # header provided in default external file .hdr
428                elif os.path.exists((input_file + ".hdr")):
429                    with open(input_file + ".hdr", "rt") as f:
430                        header_list = self.read_vcf_header(f)
431                else:
432                    try:  # Try to get header info fields and file columns
433
434                        with tempfile.TemporaryDirectory() as tmpdir:
435
436                            # Create database
437                            db_for_header = Database(database=input_file)
438
439                            # Get header columns for infos fields
440                            db_header_from_columns = (
441                                db_for_header.get_header_from_columns()
442                            )
443
444                            # Get real columns in the file
445                            db_header_columns = db_for_header.get_columns()
446
447                            # Write header file
448                            header_file_tmp = os.path.join(tmpdir, "header")
449                            f = open(header_file_tmp, "w")
450                            vcf.Writer(f, db_header_from_columns)
451                            f.close()
452
453                            # Replace #CHROM line with rel columns
454                            header_list = db_for_header.read_header_file(
455                                header_file=header_file_tmp
456                            )
457                            header_list[-1] = "\t".join(db_header_columns)
458
459                    except:
460
461                        log.warning(
462                            f"No header for file {input_file}. Set as default VCF header"
463                        )
464                        header_list = default_header_list
465
466            else:  # try for unknown format ?
467
468                log.error(f"Input file format '{input_format}' not available")
469                raise ValueError(f"Input file format '{input_format}' not available")
470
471            if not header_list:
472                header_list = default_header_list
473
474            # header as list
475            self.header_list = header_list
476
477            # header as VCF object
478            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
479
480        else:
481
482            self.header_list = None
483            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
485    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
486        """
487        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
488        DataFrame based on the connection format.
489
490        :param query: The `query` parameter in the `get_query_to_df` function is a string that
491        represents the SQL query you want to execute. This query will be used to fetch data from a
492        database and convert it into a pandas DataFrame
493        :type query: str
494        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
495        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
496        function will only fetch up to that number of rows from the database query result. If no limit
497        is specified,
498        :type limit: int
499        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
500        """
501
502        # Connexion format
503        connexion_format = self.get_connexion_format()
504
505        # Limit in query
506        if limit:
507            pd.set_option("display.max_rows", limit)
508            if connexion_format in ["duckdb"]:
509                df = (
510                    self.conn.execute(query)
511                    .fetch_record_batch(limit)
512                    .read_next_batch()
513                    .to_pandas()
514                )
515            elif connexion_format in ["sqlite"]:
516                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
517
518        # Full query
519        else:
520            if connexion_format in ["duckdb"]:
521                df = self.conn.execute(query).df()
522            elif connexion_format in ["sqlite"]:
523                df = pd.read_sql_query(query, self.conn)
524
525        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: The limit parameter in the get_query_to_df function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full query result is returned.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
527    def get_overview(self) -> None:
528        """
529        The function prints the input, output, config, and dataframe of the current object
530        """
531        table_variants_from = self.get_table_variants(clause="from")
532        sql_columns = self.get_header_columns_as_sql()
533        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
534        df = self.get_query_to_df(sql_query_export)
535        log.info(
536            "Input:  "
537            + str(self.get_input())
538            + " ["
539            + str(str(self.get_input_format()))
540            + "]"
541        )
542        log.info(
543            "Output: "
544            + str(self.get_output())
545            + " ["
546            + str(str(self.get_output_format()))
547            + "]"
548        )
549        log.info("Config: ")
550        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
551            "\n"
552        ):
553            log.info("\t" + str(d))
554        log.info("Param: ")
555        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
556            "\n"
557        ):
558            log.info("\t" + str(d))
559        log.info("Sample list: " + str(self.get_header_sample_list()))
560        log.info("Dataframe: ")
561        for d in str(df).split("\n"):
562            log.info("\t" + str(d))
563
564        # garbage collector
565        del df
566        gc.collect()
567
568        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
    def get_stats(self) -> dict:
        """
        Calculate and return statistics of the current object.

        The returned dictionary contains:

        - "Infos": input file, number of variants, number of samples (if any),
          number of INFO/FORMAT fields
        - "Variants": counts by chromosome, SNV/InDel/MNV counts, substitutions
        - "Samples": per-sample genotype counts (only if GT/FORMAT present)
        - "Header": INFO and FORMAT field descriptions
        - "Quality": QUAL statistics (only if a QUAL column is present)

        :return: a dictionary containing various statistics of the current object
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table (used in FROM clauses below)
        table_variants_from = self.get_table_variants()

        # Stats dict to fill in
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT field definitions
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variant count grouped by chromosome, sorted by chromosome name
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants (sum over chromosomes)
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Per-chromosome fraction of the total
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Genotype stats only make sense for VCF-like data with GT and FORMAT
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Extract the leading genotype token (e.g. "0/1") from the
                # sample column and count occurrences per genotype; only rows
                # whose sample field has as many ':'-separated values as
                # FORMAT are considered well-formed
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts only if it has at least one called genotype
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` is a running index shared across both field lists
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # Field ID
                header_infos_dict[i]["id"] = info

                # Number: map PyVCF's special codes back to the VCF spec
                # letters (None→".", -1→"A", -2→"G", -3→"R")
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type ("." when absent)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description (empty string when absent)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        # Summary statistics over the QUAL column, skipping missing values (".")
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel
        # NOTE(review): in the InDel branch, AND binds tighter than OR, so the
        # condition is len(REF)>1 OR (len(ALT)>1 AND len(REF)!=len(ALT)) —
        # confirm this precedence is intended (MNVs with len(REF)>1 also match)
        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Single-nucleotide substitution spectrum (e.g. "A>G" counts)
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
792    def stats_to_file(self, file: str = None) -> str:
793        """
794        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
795        into a JSON object, and writes the JSON object to the specified file.
796
797        :param file: The `file` parameter is a string that represents the file path where the JSON data
798        will be written
799        :type file: str
800        :return: the name of the file that was written to.
801        """
802
803        # Get stats
804        stats = self.get_stats()
805
806        # Serializing json
807        json_object = json.dumps(stats, indent=4)
808
809        # Writing to sample.json
810        with open(file, "w") as outfile:
811            outfile.write(json_object)
812
813        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown stats file and print its content to stdout.

        Statistics are first written to `json_file` (via `stats_to_file`),
        then read back and rendered as markdown with a title, an index of
        section links, and one table or bullet per stats entry.

        :param output_file: path of the markdown output file; if None, a
            temporary "stats.md" is used (and discarded with the tempdir)
        :type output_file: str
        :param json_file: path of the JSON stats file; if None, a temporary
            "stats.json" is used (and discarded with the tempdir)
        :type json_file: str
        :return: None
        """

        # Resolve full paths (may stay None, triggering tempdir defaults below)
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Default files live in the temporary directory
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create parent folders for both files if missing
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Load stats back (yaml.safe_load parses JSON, a YAML subset)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index of links, and section bodies
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                # Markdown anchor: lowercase with spaces replaced by dashes
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the entry as a table: first as a dict
                        # of rows, then as a JSON string; otherwise fall back
                        # to a plain bullet line (is_df is set on every path)
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"   - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    # Empty section
                    output.append(f"NA")

            # Write title, index then body to the markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Print the markdown (without the index) to stdout
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
917    def get_input(self) -> str:
918        """
919        It returns the value of the input variable.
920        :return: The input is being returned.
921        """
922        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
924    def get_input_format(self, input_file: str = None) -> str:
925        """
926        This function returns the format of the input variable, either from the provided input file or
927        by prompting for input.
928
929        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
930        represents the file path of the input file. If no `input_file` is provided when calling the
931        method, it will default to `None`
932        :type input_file: str
933        :return: The format of the input variable is being returned.
934        """
935
936        if not input_file:
937            input_file = self.get_input()
938        input_format = get_file_format(input_file)
939        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
941    def get_input_compressed(self, input_file: str = None) -> str:
942        """
943        The function `get_input_compressed` returns the format of the input variable after compressing
944        it.
945
946        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
947        that represents the file path of the input file. If no `input_file` is provided when calling the
948        method, it will default to `None` and the method will then call `self.get_input()` to
949        :type input_file: str
950        :return: The function `get_input_compressed` returns the compressed format of the input
951        variable.
952        """
953
954        if not input_file:
955            input_file = self.get_input()
956        input_compressed = get_file_compressed(input_file)
957        return input_compressed

The function get_input_compressed returns the format of the input variable after compressing it.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to
Returns

The function get_input_compressed returns the compressed format of the input variable.

def get_output(self) -> str:
959    def get_output(self) -> str:
960        """
961        It returns the output of the neuron.
962        :return: The output of the neural network.
963        """
964
965        return self.output

It returns the output file.

Returns

The output file path.

def get_output_format(self, output_file: str = None) -> str:
967    def get_output_format(self, output_file: str = None) -> str:
968        """
969        The function `get_output_format` returns the format of the input variable or the output file if
970        provided.
971
972        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
973        that represents the file path of the output file. If no `output_file` is provided when calling
974        the method, it will default to the output obtained from the `get_output` method of the class
975        instance. The
976        :type output_file: str
977        :return: The format of the input variable is being returned.
978        """
979
980        if not output_file:
981            output_file = self.get_output()
982        output_format = get_file_format(output_file)
983
984        return output_format

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
986    def get_config(self) -> dict:
987        """
988        It returns the config
989        :return: The config variable is being returned.
990        """
991        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
993    def get_param(self) -> dict:
994        """
995        It returns the param
996        :return: The param variable is being returned.
997        """
998        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
1000    def get_connexion_db(self) -> str:
1001        """
1002        It returns the connexion_db attribute of the object
1003        :return: The connexion_db is being returned.
1004        """
1005        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
1007    def get_prefix(self) -> str:
1008        """
1009        It returns the prefix of the object.
1010        :return: The prefix is being returned.
1011        """
1012        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
1014    def get_table_variants(self, clause: str = "select") -> str:
1015        """
1016        This function returns the table_variants attribute of the object
1017
1018        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
1019        defaults to select (optional)
1020        :return: The table_variants attribute of the object.
1021        """
1022
1023        # Access
1024        access = self.get_config().get("access", None)
1025
1026        # Clauses "select", "where", "update"
1027        if clause in ["select", "where", "update"]:
1028            table_variants = self.table_variants
1029        # Clause "from"
1030        elif clause in ["from"]:
1031            # For Read Only
1032            if self.get_input_format() in ["parquet"] and access in ["RO"]:
1033                input_file = self.get_input()
1034                table_variants = f"'{input_file}' as variants"
1035            # For Read Write
1036            else:
1037                table_variants = f"{self.table_variants} as variants"
1038        else:
1039            table_variants = self.table_variants
1040        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
1042    def get_tmp_dir(self) -> str:
1043        """
1044        The function `get_tmp_dir` returns the temporary directory path based on configuration
1045        parameters or a default path.
1046        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1047        configuration, parameters, and a default value of "/tmp".
1048        """
1049
1050        return get_tmp(
1051            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1052        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
1054    def get_connexion_type(self) -> str:
1055        """
1056        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1057
1058        :return: The connexion type is being returned.
1059        """
1060        return self.get_config().get("connexion_type", "memory")

Returns the connexion type from the configuration, defaulting to "memory" if none is set.

Returns

The connexion type is being returned.

def get_connexion(self):
1062    def get_connexion(self):
1063        """
1064        It returns the connection object
1065
1066        :return: The connection object.
1067        """
1068        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
1070    def close_connexion(self) -> None:
1071        """
1072        This function closes the connection to the database.
1073        :return: The connection is being closed.
1074        """
1075        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
1077    def get_header(self, type: str = "vcf"):
1078        """
1079        This function returns the header of the VCF file as a list of strings
1080
1081        :param type: the type of header you want to get, defaults to vcf (optional)
1082        :return: The header of the vcf file.
1083        """
1084
1085        if self.header_vcf:
1086            if type == "vcf":
1087                return self.header_vcf
1088            elif type == "list":
1089                return self.header_list
1090        else:
1091            if type == "vcf":
1092                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1093                return header
1094            elif type == "list":
1095                return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_infos_list(self) -> list:
1097    def get_header_infos_list(self) -> list:
1098        """
1099        This function retrieves a list of information fields from the header.
1100        :return: A list of information fields from the header.
1101        """
1102
1103        # Init
1104        infos_list = []
1105
1106        for field in self.get_header().infos:
1107            infos_list.append(field)
1108
1109        return infos_list

This function retrieves a list of information fields from the header.

Returns

A list of information fields from the header.

def get_header_length(self, file: str = None) -> int:
1111    def get_header_length(self, file: str = None) -> int:
1112        """
1113        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1114        line.
1115
1116        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1117        header file. If this argument is provided, the function will read the header from the specified
1118        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1119        :type file: str
1120        :return: the length of the header list, excluding the #CHROM line.
1121        """
1122
1123        if file:
1124            return len(self.read_vcf_header_file(file=file)) - 1
1125        elif self.get_header(type="list"):
1126            return len(self.get_header(type="list")) - 1
1127        else:
1128            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1130    def get_header_columns(self) -> str:
1131        """
1132        This function returns the header list of a VCF
1133
1134        :return: The length of the header list.
1135        """
1136        if self.get_header():
1137            return self.get_header(type="list")[-1]
1138        else:
1139            return ""

This function returns the last line of the VCF header (the #CHROM columns line)

Returns

The #CHROM columns line of the header, or an empty string if no header is available.

def get_header_columns_as_list(self) -> list:
1141    def get_header_columns_as_list(self) -> list:
1142        """
1143        This function returns the header list of a VCF
1144
1145        :return: The length of the header list.
1146        """
1147        if self.get_header():
1148            return self.get_header_columns().strip().split("\t")
1149        else:
1150            return []

This function returns the header columns of a VCF as a list

Returns

A list of column names from the #CHROM header line, or an empty list if no header is available.

def get_header_columns_as_sql(self) -> str:
1152    def get_header_columns_as_sql(self) -> str:
1153        """
1154        This function retruns header length (without #CHROM line)
1155
1156        :return: The length of the header list.
1157        """
1158        sql_column_list = []
1159        for col in self.get_header_columns_as_list():
1160            sql_column_list.append(f'"{col}"')
1161        return ",".join(sql_column_list)

This function returns the header columns as a comma-separated string of double-quoted SQL column names

Returns

A comma-separated string of quoted column names.

def get_header_sample_list( self, check: bool = False, samples: list = None, samples_force: bool = False) -> list:
1163    def get_header_sample_list(
1164        self, check: bool = False, samples: list = None, samples_force: bool = False
1165    ) -> list:
1166        """
1167        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
1168        checking and filtering based on input parameters.
1169
1170        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
1171        parameter that determines whether to check if the samples in the list are properly defined as
1172        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
1173        list is defined as a, defaults to False
1174        :type check: bool (optional)
1175        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
1176        allows you to specify a subset of samples from the header. If you provide a list of sample
1177        names, the function will check if each sample is defined in the header. If a sample is not found
1178        in the
1179        :type samples: list
1180        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
1181        a boolean parameter that determines whether to force the function to return the sample list
1182        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
1183        function will return the sample list without performing, defaults to False
1184        :type samples_force: bool (optional)
1185        :return: The function `get_header_sample_list` returns a list of samples based on the input
1186        parameters and conditions specified in the function.
1187        """
1188
1189        # Init
1190        samples_list = []
1191
1192        if samples is None:
1193            samples_list = self.header_vcf.samples
1194        else:
1195            samples_checked = []
1196            for sample in samples:
1197                if sample in self.header_vcf.samples:
1198                    samples_checked.append(sample)
1199                else:
1200                    log.warning(f"Sample '{sample}' not defined in header")
1201            samples_list = samples_checked
1202
1203            # Force sample list without checking if is_genotype_column
1204            if samples_force:
1205                log.warning(f"Samples {samples_list} not checked if genotypes")
1206                return samples_list
1207
1208        if check:
1209            samples_checked = []
1210            for sample in samples_list:
1211                if self.is_genotype_column(column=sample):
1212                    samples_checked.append(sample)
1213                else:
1214                    log.warning(
1215                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
1216                    )
1217            samples_list = samples_checked
1218
1219        # Return samples list
1220        return samples_list

The function get_header_sample_list returns a list of samples from a VCF header, with optional checking and filtering based on input parameters.

Parameters
  • check: The check parameter in the get_header_sample_list function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If check is set to True, the function will verify that each sample in the list is defined as a genotype column. Defaults to False
  • samples: The samples parameter in the get_header_sample_list function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header. If a sample is not found in the header, a warning is logged and the sample is excluded from the result
  • samples_force: The samples_force parameter in the get_header_sample_list function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If samples_force is set to True, the function will return the sample list without performing the genotype-column check. Defaults to False
Returns

The function get_header_sample_list returns a list of samples based on the input parameters and conditions specified in the function.

def is_genotype_column(self, column: str = None) -> bool:
1222    def is_genotype_column(self, column: str = None) -> bool:
1223        """
1224        This function checks if a given column is a genotype column in a database.
1225
1226        :param column: The `column` parameter in the `is_genotype_column` method is a string that
1227        represents the column name in a database table. This method checks if the specified column is a
1228        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
1229        method of
1230        :type column: str
1231        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
1232        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
1233        column name and returns the result. If the `column` parameter is None, it returns False.
1234        """
1235
1236        if column is not None:
1237            return Database(database=self.get_input()).is_genotype_column(column=column)
1238        else:
1239            return False

This function checks if a given column is a genotype column in a database.

Parameters
  • column: The column parameter in the is_genotype_column method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it calls the is_genotype_column method of
Returns

The is_genotype_column method is returning a boolean value. If the column parameter is not None, it calls the is_genotype_column method of the Database class with the specified column name and returns the result. If the column parameter is None, it returns False.

def get_verbose(self) -> bool:
1241    def get_verbose(self) -> bool:
1242        """
1243        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1244        exist
1245
1246        :return: The value of the key "verbose" in the config dictionary.
1247        """
1248        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1250    def get_connexion_format(self) -> str:
1251        """
1252        It returns the connexion format of the object.
1253        :return: The connexion_format is being returned.
1254        """
1255        connexion_format = self.connexion_format
1256        if connexion_format not in ["duckdb", "sqlite"]:
1257            log.error(f"Unknown connexion format {connexion_format}")
1258            raise ValueError(f"Unknown connexion format {connexion_format}")
1259        else:
1260            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
1262    def insert_file_to_table(
1263        self,
1264        file,
1265        columns: str,
1266        header_len: int = 0,
1267        sep: str = "\t",
1268        chunksize: int = 1000000,
1269    ) -> None:
1270        """
1271        The function reads a file in chunks and inserts each chunk into a table based on the specified
1272        database format.
1273
1274        :param file: The `file` parameter is the file that you want to load into a table. It should be
1275        the path to the file on your system
1276        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
1277        should contain the names of the columns in the table where the data will be inserted. The column
1278        names should be separated by commas within the string. For example, if you have columns named
1279        "id", "name
1280        :type columns: str
1281        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
1282        the number of lines to skip at the beginning of the file before reading the actual data. This
1283        parameter allows you to skip any header information present in the file before processing the
1284        data, defaults to 0
1285        :type header_len: int (optional)
1286        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
1287        separator character that is used in the file being read. In this case, the default separator is
1288        set to `\t`, which represents a tab character. You can change this parameter to a different
1289        separator character if, defaults to \t
1290        :type sep: str (optional)
1291        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
1292        when processing the file in chunks. In the provided code snippet, the default value for
1293        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
1294        to 1000000
1295        :type chunksize: int (optional)
1296        """
1297
1298        # Config
1299        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
1300        connexion_format = self.get_connexion_format()
1301
1302        log.debug("chunksize: " + str(chunksize))
1303
1304        if chunksize:
1305            for chunk in pd.read_csv(
1306                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
1307            ):
1308                if connexion_format in ["duckdb"]:
1309                    sql_insert_into = (
1310                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
1311                    )
1312                    self.conn.execute(sql_insert_into)
1313                elif connexion_format in ["sqlite"]:
1314                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. The default separator is `\t`, which represents a tab character. You can change this parameter to a different separator character if needed. Defaults to `\t`
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000, meaning the file will be read in chunks of 1,000,000 rows. Defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
1316    def load_data(
1317        self,
1318        input_file: str = None,
1319        drop_variants_table: bool = False,
1320        sample_size: int = 20480,
1321    ) -> None:
1322        """
1323        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1324        table before loading the data and specify a sample size.
1325
1326        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1327        table
1328        :type input_file: str
1329        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1330        determines whether the variants table should be dropped before loading the data. If set to
1331        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1332        not be dropped, defaults to False
1333        :type drop_variants_table: bool (optional)
1334        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1335        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1336        20480
1337        :type sample_size: int (optional)
1338        """
1339
1340        log.info("Loading...")
1341
1342        # change input file
1343        if input_file:
1344            self.set_input(input_file)
1345            self.set_header()
1346
1347        # drop variants table
1348        if drop_variants_table:
1349            self.drop_variants_table()
1350
1351        # get table variants
1352        table_variants = self.get_table_variants()
1353
1354        # Access
1355        access = self.get_config().get("access", None)
1356        log.debug(f"access: {access}")
1357
1358        # Input format and compress
1359        input_format = self.get_input_format()
1360        input_compressed = self.get_input_compressed()
1361        log.debug(f"input_format: {input_format}")
1362        log.debug(f"input_compressed: {input_compressed}")
1363
1364        # input_compressed_format
1365        if input_compressed:
1366            input_compressed_format = "gzip"
1367        else:
1368            input_compressed_format = "none"
1369        log.debug(f"input_compressed_format: {input_compressed_format}")
1370
1371        # Connexion format
1372        connexion_format = self.get_connexion_format()
1373
1374        # Sample size
1375        if not sample_size:
1376            sample_size = -1
1377        log.debug(f"sample_size: {sample_size}")
1378
1379        # Load data
1380        log.debug(f"Load Data from {input_format}")
1381
1382        # DuckDB connexion
1383        if connexion_format in ["duckdb"]:
1384
1385            # Database already exists
1386            if self.input_format in ["db", "duckdb"]:
1387
1388                if connexion_format in ["duckdb"]:
1389                    log.debug(f"Input file format '{self.input_format}' duckDB")
1390                else:
1391                    log.error(
1392                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1393                    )
1394                    raise ValueError(
1395                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1396                    )
1397
1398            # Load from existing database format
1399            else:
1400
1401                try:
1402                    # Create Table or View
1403                    database = Database(database=self.input)
1404                    sql_from = database.get_sql_from(sample_size=sample_size)
1405
1406                    if access in ["RO"]:
1407                        sql_load = (
1408                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1409                        )
1410                    else:
1411                        sql_load = (
1412                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1413                        )
1414                    self.conn.execute(sql_load)
1415
1416                except:
1417                    # Format not available
1418                    log.error(f"Input file format '{self.input_format}' not available")
1419                    raise ValueError(
1420                        f"Input file format '{self.input_format}' not available"
1421                    )
1422
1423        # SQLite connexion
1424        elif connexion_format in ["sqlite"] and input_format in [
1425            "vcf",
1426            "tsv",
1427            "csv",
1428            "psv",
1429        ]:
1430
1431            # Main structure
1432            structure = {
1433                "#CHROM": "VARCHAR",
1434                "POS": "INTEGER",
1435                "ID": "VARCHAR",
1436                "REF": "VARCHAR",
1437                "ALT": "VARCHAR",
1438                "QUAL": "VARCHAR",
1439                "FILTER": "VARCHAR",
1440                "INFO": "VARCHAR",
1441            }
1442
1443            # Strcuture with samples
1444            structure_complete = structure
1445            if self.get_header_sample_list():
1446                structure["FORMAT"] = "VARCHAR"
1447                for sample in self.get_header_sample_list():
1448                    structure_complete[sample] = "VARCHAR"
1449
1450            # Columns list for create and insert
1451            sql_create_table_columns = []
1452            sql_create_table_columns_list = []
1453            for column in structure_complete:
1454                column_type = structure_complete[column]
1455                sql_create_table_columns.append(
1456                    f'"{column}" {column_type} default NULL'
1457                )
1458                sql_create_table_columns_list.append(f'"{column}"')
1459
1460            # Create database
1461            log.debug(f"Create Table {table_variants}")
1462            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1463            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1464            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1465            self.conn.execute(sql_create_table)
1466
1467            # chunksize define length of file chunk load file
1468            chunksize = 100000
1469
1470            # delimiter
1471            delimiter = file_format_delimiters.get(input_format, "\t")
1472
1473            # Load the input file
1474            with open(self.input, "rt") as input_file:
1475
1476                # Use the appropriate file handler based on the input format
1477                if input_compressed:
1478                    input_file = bgzf.open(self.input, "rt")
1479                if input_format in ["vcf"]:
1480                    header_len = self.get_header_length()
1481                else:
1482                    header_len = 0
1483
1484                # Insert the file contents into a table
1485                self.insert_file_to_table(
1486                    input_file,
1487                    columns=sql_create_table_columns_list_sql,
1488                    header_len=header_len,
1489                    sep=delimiter,
1490                    chunksize=chunksize,
1491                )
1492
1493        else:
1494            log.error(
1495                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1496            )
1497            raise ValueError(
1498                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1499            )
1500
1501        # Explode INFOS fields into table fields
1502        if self.get_explode_infos():
1503            self.explode_infos(
1504                prefix=self.get_explode_infos_prefix(),
1505                fields=self.get_explode_infos_fields(),
1506                force=True,
1507            )
1508
1509        # Create index after insertion
1510        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None or 0, it is replaced by -1 (no sampling limit). Defaults to 20480
def get_explode_infos(self) -> bool:
1512    def get_explode_infos(self) -> bool:
1513        """
1514        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1515        to False if it is not set.
1516        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1517        value. If the parameter is not present, it will return False.
1518        """
1519
1520        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function returns the list of INFO fields to
        explode, resolved against the fields declared in the VCF header.

        :param explode_infos_fields: Either a comma-separated string or a list of
        field names/regex patterns to explode. The keyword "*" matches every field
        of the header. If not provided, the value is read from the
        "explode.explode_infos_fields" parameter, and finally defaults to "*"
        (i.e. all header fields)
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: If True, fields that are not present
        in the header are excluded from the result; if False (default), a
        requested field absent from the header is kept as-is, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The resolved list of field names, stripped of spaces, without
        duplicates, in request order (pattern expansions are sorted
        alphabetically). Returns an empty list when nothing can be resolved.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (accept either a comma-separated string or a list)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            # NOTE(review): fields_without_all is computed but never used below
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header (deduplicated and sorted)
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all header fields matching the pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search
                # (an exact header match takes precedence over pattern expansion;
                # fields explicitly requested elsewhere are not duplicated here)
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter is a string that specifies the fields to be exploded. It can be set to "*" to explode all fields in the header, or it can be a comma-separated list of field names (or patterns) to explode
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields. If it is set to False, such fields are kept, defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided, the value configured in the parameters (or "*", meaning all header fields) is used. The returned list is built after removing spaces, splitting the string by commas, and resolving patterns against the header fields.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1622    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1623        """
1624        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1625        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1626        not provided.
1627
1628        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1629        prefix to be used for exploding or expanding information
1630        :type explode_infos_prefix: str
1631        :return: the value of the variable `explode_infos_prefix`.
1632        """
1633
1634        if not explode_infos_prefix:
1635            explode_infos_prefix = (
1636                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1637            )
1638
1639        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
1641    def add_column(
1642        self,
1643        table_name,
1644        column_name,
1645        column_type,
1646        default_value=None,
1647        drop: bool = False,
1648    ) -> dict:
1649        """
1650        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
1651        doesn't already exist.
1652
1653        :param table_name: The name of the table to which you want to add a column
1654        :param column_name: The parameter "column_name" is the name of the column that you want to add
1655        to the table
1656        :param column_type: The `column_type` parameter specifies the data type of the column that you
1657        want to add to the table. It should be a string that represents the desired data type, such as
1658        "INTEGER", "TEXT", "REAL", etc
1659        :param default_value: The `default_value` parameter is an optional parameter that specifies the
1660        default value for the newly added column. If a default value is provided, it will be assigned to
1661        the column for any existing rows that do not have a value for that column
1662        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
1663        if it already exists in the table. If `drop` is set to `True`, the function will drop the
1664        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
1665        to False
1666        :type drop: bool (optional)
1667        :return: a boolean value indicating whether the column was successfully added to the table.
1668        """
1669
1670        # added
1671        added = False
1672        dropped = False
1673
1674        # Check if the column already exists in the table
1675        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1676        columns = self.get_query_to_df(query).columns.tolist()
1677        if column_name.upper() in [c.upper() for c in columns]:
1678            log.debug(
1679                f"The {column_name} column already exists in the {table_name} table"
1680            )
1681            if drop:
1682                self.drop_column(table_name=table_name, column_name=column_name)
1683                dropped = True
1684            else:
1685                return None
1686        else:
1687            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1688
1689        # Add column in table
1690        add_column_query = (
1691            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
1692        )
1693        if default_value is not None:
1694            add_column_query += f" DEFAULT {default_value}"
1695        self.execute_query(add_column_query)
1696        added = not dropped
1697        log.debug(
1698            f"The {column_name} column was successfully added to the {table_name} table"
1699        )
1700
1701        if added:
1702            added_column = {
1703                "table_name": table_name,
1704                "column_name": column_name,
1705                "column_type": column_type,
1706                "default_value": default_value,
1707            }
1708        else:
1709            added_column = None
1710
1711        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column. If drop is set to False (default), an existing column is left untouched, defaults to False
Returns

a dict describing the added column (table name, column name, type and default value), or None when the column already existed.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1713    def drop_column(
1714        self, column: dict = None, table_name: str = None, column_name: str = None
1715    ) -> bool:
1716        """
1717        The `drop_column` function drops a specified column from a given table in a database and returns
1718        True if the column was successfully dropped, and False if the column does not exist in the
1719        table.
1720
1721        :param column: The `column` parameter is a dictionary that contains information about the column
1722        you want to drop. It has two keys:
1723        :type column: dict
1724        :param table_name: The `table_name` parameter is the name of the table from which you want to
1725        drop a column
1726        :type table_name: str
1727        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1728        from the table
1729        :type column_name: str
1730        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1731        and False if the column does not exist in the table.
1732        """
1733
1734        # Find column infos
1735        if column:
1736            if isinstance(column, dict):
1737                table_name = column.get("table_name", None)
1738                column_name = column.get("column_name", None)
1739            elif isinstance(column, str):
1740                table_name = self.get_table_variants()
1741                column_name = column
1742            else:
1743                table_name = None
1744                column_name = None
1745
1746        if not table_name and not column_name:
1747            return False
1748
1749        # Removed
1750        removed = False
1751
1752        # Check if the column already exists in the table
1753        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1754        columns = self.get_query_to_df(query).columns.tolist()
1755        if column_name in columns:
1756            log.debug(f"The {column_name} column exists in the {table_name} table")
1757        else:
1758            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1759            return False
1760
1761        # Add column in table # ALTER TABLE integers DROP k
1762        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1763        self.execute_query(add_column_query)
1764        removed = True
1765        log.debug(
1766            f"The {column_name} column was successfully dropped to the {table_name} table"
1767        )
1768
1769        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys:
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into dedicated table columns.

        For each requested INFO field, a column named `<prefix><field>` is added
        to the variants table and filled by parsing the raw `INFO` column with an
        UPDATE statement, issued per chromosome. Nothing is done when the database
        is opened with read-only ("RO") access.

        :param prefix: prefix for the exploded columns; when not a string (e.g.
        None or True), falls back to `get_explode_infos_prefix()` and finally to
        "INFO/"
        :type prefix: str
        :param create_index: when True, re-create the table indexes after the
        columns have been filled, defaults to False
        :type create_index: bool (optional)
        :param fields: list of INFO field names (or patterns) to explode; when
        empty or None, the fields resolved by `get_explode_infos_fields()` are
        used
        :type fields: list
        :param force: when True, drop and re-create a column that already exists
        in the table (and re-fill it), defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: when True, all fields are updated in
        a single UPDATE statement per chromosome; otherwise one UPDATE per field,
        defaults to False. (Parameter name keeps its historical spelling for
        backward compatibility.)
        :type proccess_all_fields_together: bool (optional)
        :param table: name of the table to alter; defaults to the variants table
        :type table: str
        :return: the list of added columns, as dicts returned by `add_column()`
        """

        # drop indexes (they would be invalidated by the schema changes below)
        self.drop_indexes()

        # connexion format (drives the SQL dialect used for the UPDATE)
        connexion_format = self.get_connexion_format()

        # Access mode ("RO" means read-only: no schema changes allowed)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: normalize to a usable string, defaulting to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except silently falls back to no extra infos;
            # any failure in get_extra_infos() is swallowed here
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly requested ones)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" or regexes)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type/cardinality from the header; unknown fields are
                    # treated as single-valued strings
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR regardless of type
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "<info>=value" from the raw
                        # INFO column ('' and '.' are normalized to NULL)
                        # NOTE(review): if connexion_format is neither duckdb nor
                        # sqlite, update_info_field keeps its previous value (or
                        # is unbound on the first iteration) — confirm only these
                        # two formats reach this code path
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (keeps each UPDATE smaller; falls back to a
                # single unfiltered pass when the chromosome list is unavailable)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (omitted when there is only one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list to the fields parameter
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated. If force is set to False, an existing column is kept, defaults to False
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together. If set to False, each INFO field will be processed individually. The default value is, defaults to False
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for the table parameter, the function will use that table name. If the table parameter is
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1988    def create_indexes(self) -> None:
1989        """
1990        Create indexes on the table after insertion
1991        """
1992
1993        # Access
1994        access = self.get_config().get("access", None)
1995
1996        # get table variants
1997        table_variants = self.get_table_variants("FROM")
1998
1999        if self.get_indexing() and access not in ["RO"]:
2000            # Create index
2001            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
2002            self.conn.execute(sql_create_table_index)
2003            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
2004            self.conn.execute(sql_create_table_index)
2005            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
2006            self.conn.execute(sql_create_table_index)
2007            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
2008            self.conn.execute(sql_create_table_index)
2009            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
2010            self.conn.execute(sql_create_table_index)
2011            for field in self.index_additionnal_fields:
2012                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
2013                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
2015    def drop_indexes(self) -> None:
2016        """
2017        Create indexes on the table after insertion
2018        """
2019
2020        # Access
2021        access = self.get_config().get("access", None)
2022
2023        # get table variants
2024        table_variants = self.get_table_variants("FROM")
2025
2026        # Get database format
2027        connexion_format = self.get_connexion_format()
2028
2029        if access not in ["RO"]:
2030            if connexion_format in ["duckdb"]:
2031                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
2032            elif connexion_format in ["sqlite"]:
2033                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
2034
2035            list_indexes = self.conn.execute(sql_list_indexes)
2036            index_names = [row[0] for row in list_indexes.fetchall()]
2037            for index in index_names:
2038                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
2039                self.conn.execute(sql_drop_table_index)

Drop the indexes of the variants table.

def read_vcf_header(self, f) -> list:
2041    def read_vcf_header(self, f) -> list:
2042        """
2043        It reads the header of a VCF file and returns a list of the header lines
2044
2045        :param f: the file object
2046        :return: The header lines of the VCF file.
2047        """
2048
2049        header_list = []
2050        for line in f:
2051            header_list.append(line)
2052            if line.startswith("#CHROM"):
2053                break
2054        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
2056    def read_vcf_header_file(self, file: str = None) -> list:
2057        """
2058        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
2059        uncompressed files.
2060
2061        :param file: The `file` parameter is a string that represents the path to the VCF header file
2062        that you want to read. It is an optional parameter, so if you don't provide a value, it will
2063        default to `None`
2064        :type file: str
2065        :return: The function `read_vcf_header_file` returns a list.
2066        """
2067
2068        if self.get_input_compressed(input_file=file):
2069            with bgzf.open(file, "rt") as f:
2070                return self.read_vcf_header(f=f)
2071        else:
2072            with open(file, "rt") as f:
2073                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
2075    def execute_query(self, query: str):
2076        """
2077        It takes a query as an argument, executes it, and returns the results
2078
2079        :param query: The query to be executed
2080        :return: The result of the query is being returned.
2081        """
2082        if query:
2083            return self.conn.execute(query)  # .fetchall()
2084        else:
2085            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None, fields_to_rename: dict | None = None) -> bool:
2087    def export_output(
2088        self,
2089        output_file: str | None = None,
2090        output_header: str | None = None,
2091        export_header: bool = True,
2092        query: str | None = None,
2093        parquet_partitions: list | None = None,
2094        chunk_size: int | None = None,
2095        threads: int | None = None,
2096        sort: bool = False,
2097        index: bool = False,
2098        order_by: str | None = None,
2099        fields_to_rename: dict | None = None
2100    ) -> bool:
2101        """
2102        The `export_output` function exports data from a VCF file to various formats, including VCF,
2103        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
2104        partitioning.
2105        
2106        :param output_file: The `output_file` parameter is a string that specifies the name of the
2107        output file where the exported data will be saved
2108        :type output_file: str | None
2109        :param output_header: The `output_header` parameter is a string that specifies the name of the
2110        file where the header of the VCF file will be exported. If this parameter is not provided, the
2111        header will be exported to a file with the same name as the `output_file` parameter, but with
2112        the extension "
2113        :type output_header: str | None
2114        :param export_header: The `export_header` parameter is a boolean flag that determines whether
2115        the header of a VCF file should be exported to a separate file or not. If `export_header` is
2116        True, the header will be exported to a file. If `export_header` is False, the header will not
2117        be, defaults to True
2118        :type export_header: bool (optional)
2119        :param query: The `query` parameter in the `export_output` function is an optional SQL query
2120        that can be used to filter and select specific data from the VCF file before exporting it. If
2121        provided, only the data that matches the query will be exported. This allows you to customize
2122        the exported data based on
2123        :type query: str | None
2124        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
2125        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
2126        organize data in a hierarchical directory structure based on the values of one or more columns.
2127        This can improve query performance when working with large datasets
2128        :type parquet_partitions: list | None
2129        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
2130        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
2131        multiple files. It helps in optimizing the export process by breaking down the data into
2132        manageable chunks for processing and storage
2133        :type chunk_size: int | None
2134        :param threads: The `threads` parameter in the `export_output` function specifies the number of
2135        threads to be used during the export process. It determines the level of parallelism and can
2136        improve the performance of the export operation. If this parameter is not provided, the function
2137        will use the default number of threads
2138        :type threads: int | None
2139        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
2140        determines whether the output file should be sorted based on genomic coordinates of the
2141        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
2142        `False`,, defaults to False
2143        :type sort: bool (optional)
2144        :param index: The `index` parameter in the `export_output` function is a boolean flag that
2145        determines whether an index should be created on the output file. If `index` is set to `True`,
2146        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
2147        :type index: bool (optional)
2148        :param order_by: The `order_by` parameter in the `export_output` function is a string that
2149        specifies the column(s) to use for sorting the output file. This parameter is only applicable
2150        when exporting data in VCF format. It allows you to specify the column(s) based on which the
2151        output file should be
2152        :type order_by: str | None
2153        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
2154        mapping of field names to be renamed during the export process. This parameter allows you to
2155        customize the output field names before exporting the data. Each key-value pair in the
2156        dictionary represents the original field name as the key and the new field name
2157        :type fields_to_rename: dict | None
2158        :return: The `export_output` function returns a boolean value. It checks if the output file
2159        exists and returns True if it does, or None if it doesn't.
2160        """
2161
2162        # Log
2163        log.info("Exporting...")
2164
2165        # Full path
2166        output_file = full_path(output_file)
2167        output_header = full_path(output_header)
2168
2169        # Config
2170        config = self.get_config()
2171
2172        # Param
2173        param = self.get_param()
2174
2175        # Tmp files to remove
2176        tmp_to_remove = []
2177
2178        # If no output, get it
2179        if not output_file:
2180            output_file = self.get_output()
2181
2182        # If not threads
2183        if not threads:
2184            threads = self.get_threads()
2185
2186        # Rename fields
2187        if not fields_to_rename:
2188            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
2189        self.rename_info_fields(fields_to_rename=fields_to_rename)
2190
2191        # Auto header name with extension
2192        if export_header or output_header:
2193            if not output_header:
2194                output_header = f"{output_file}.hdr"
2195            # Export header
2196            self.export_header(output_file=output_file)
2197
2198        # Switch off export header if VCF output
2199        output_file_type = get_file_format(output_file)
2200        if output_file_type in ["vcf"]:
2201            export_header = False
2202            tmp_to_remove.append(output_header)
2203
2204        # Chunk size
2205        if not chunk_size:
2206            chunk_size = config.get("chunk_size", None)
2207
2208        # Parquet partition
2209        if not parquet_partitions:
2210            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2211        if parquet_partitions and isinstance(parquet_partitions, str):
2212            parquet_partitions = parquet_partitions.split(",")
2213
2214        # Order by
2215        if not order_by:
2216            order_by = param.get("export", {}).get("order_by", "")
2217
2218        # Header in output
2219        header_in_output = param.get("export", {}).get("include_header", False)
2220
2221        # Database
2222        database_source = self.get_connexion()
2223
2224        # Connexion format
2225        connexion_format = self.get_connexion_format()
2226
2227        # Explode infos
2228        if self.get_explode_infos():
2229            self.explode_infos(
2230                prefix=self.get_explode_infos_prefix(),
2231                fields=self.get_explode_infos_fields(),
2232                force=False,
2233            )
2234
2235        # if connexion_format in ["sqlite"] or query:
2236        if connexion_format in ["sqlite"]:
2237
2238            # Export in Parquet
2239            random_tmp = "".join(
2240                random.choice(string.ascii_lowercase) for i in range(10)
2241            )
2242            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2243            tmp_to_remove.append(database_source)
2244
2245            # Table Variants
2246            table_variants = self.get_table_variants()
2247
2248            # Create export query
2249            sql_query_export_subquery = f"""
2250                SELECT * FROM {table_variants}
2251                """
2252
2253            # Write source file
2254            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2255
2256        # Create database
2257        database = Database(
2258            database=database_source,
2259            table="variants",
2260            header_file=output_header,
2261            conn_config=self.get_connexion_config(),
2262        )
2263
2264        # Existing colomns header
2265        existing_columns_header = database.get_header_columns_from_database(query=query)
2266
2267        # Sample list
2268        if output_file_type in ["vcf"]:
2269            get_samples = self.get_samples()
2270            get_samples_check = self.get_samples_check()
2271            samples_force = get_samples is not None
2272            sample_list = self.get_header_sample_list(
2273                check=get_samples_check,
2274                samples=get_samples,
2275                samples_force=samples_force,
2276            )
2277        else:
2278            sample_list = None
2279
2280        # Export file
2281        database.export(
2282            output_database=output_file,
2283            output_header=output_header,
2284            existing_columns_header=existing_columns_header,
2285            parquet_partitions=parquet_partitions,
2286            chunk_size=chunk_size,
2287            threads=threads,
2288            sort=sort,
2289            index=index,
2290            header_in_output=header_in_output,
2291            order_by=order_by,
2292            query=query,
2293            export_header=export_header,
2294            sample_list=sample_list,
2295        )
2296
2297        # Remove
2298        remove_if_exists(tmp_to_remove)
2299
2300        return (os.path.exists(output_file) or None) and (
2301            os.path.exists(output_file) or None
2302        )

The export_output function exports data from a VCF file to various formats, including VCF, CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and partitioning.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr"
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file. If export_header is False, the header will not be, defaults to True
  • query: The query parameter in the export_output function is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported. This allows you to customize the exported data based on
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in a batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files. It helps in optimizing the export process by breaking down the data into manageable chunks for processing and storage
  • threads: The threads parameter in the export_output function specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If this parameter is not provided, the function will use the default number of threads
  • sort: The sort parameter in the export_output function is a boolean flag that determines whether the output file should be sorted based on genomic coordinates of the variants. If sort is set to True, the output file will be sorted; if sort is set to False, it will not be. Defaults to False
  • index: The index parameter in the export_output function is a boolean flag that determines whether an index should be created on the output file. If index is set to True, an index will be created on the output file. If index is set to False, no, defaults to False
  • order_by: The order_by parameter in the export_output function is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format. It allows you to specify the column(s) based on which the output file should be
  • fields_to_rename: The fields_to_rename parameter is a dictionary that specifies the mapping of field names to be renamed during the export process. This parameter allows you to customize the output field names before exporting the data. Each key-value pair in the dictionary represents the original field name as the key and the new field name
Returns

The export_output function returns a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2304    def get_extra_infos(self, table: str = None) -> list:
2305        """
2306        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2307        in the header.
2308
2309        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2310        name of the table from which you want to retrieve the extra columns that are not present in the
2311        header. If the `table` parameter is not provided when calling the function, it will default to
2312        using the variants
2313        :type table: str
2314        :return: A list of columns that are in the specified table but not in the header of the table.
2315        """
2316
2317        header_columns = []
2318
2319        if not table:
2320            table = self.get_table_variants(clause="from")
2321            header_columns = self.get_header_columns()
2322
2323        # Check all columns in the database
2324        query = f""" SELECT * FROM {table} LIMIT 1 """
2325        log.debug(f"query {query}")
2326        table_columns = self.get_query_to_df(query).columns.tolist()
2327        extra_columns = []
2328
2329        # Construct extra infos (not in header)
2330        for column in table_columns:
2331            if column not in header_columns:
2332                extra_columns.append(column)
2333
2334        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2336    def get_extra_infos_sql(self, table: str = None) -> str:
2337        """
2338        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2339        by double quotes
2340
2341        :param table: The name of the table to get the extra infos from. If None, the default table is
2342        used
2343        :type table: str
2344        :return: A string of the extra infos
2345        """
2346
2347        return ", ".join(
2348            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2349        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2351    def export_header(
2352        self,
2353        header_name: str = None,
2354        output_file: str = None,
2355        output_file_ext: str = ".hdr",
2356        clean_header: bool = True,
2357        remove_chrom_line: bool = False,
2358    ) -> str:
2359        """
2360        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2361        specified options, and writes it to a new file.
2362
2363        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2364        this parameter is not specified, the header will be written to the output file
2365        :type header_name: str
2366        :param output_file: The `output_file` parameter in the `export_header` function is used to
2367        specify the name of the output file where the header will be written. If this parameter is not
2368        provided, the header will be written to a temporary file
2369        :type output_file: str
2370        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2371        string that represents the extension of the output header file. By default, it is set to ".hdr"
2372        if not specified by the user. This extension will be appended to the `output_file` name to
2373        create the final, defaults to .hdr
2374        :type output_file_ext: str (optional)
2375        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2376        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2377        `True`, the function will clean the header by modifying certain lines based on a specific
2378        pattern. If `clean_header`, defaults to True
2379        :type clean_header: bool (optional)
2380        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2381        boolean flag that determines whether the #CHROM line should be removed from the header before
2382        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2383        defaults to False
2384        :type remove_chrom_line: bool (optional)
2385        :return: The function `export_header` returns the name of the temporary header file that is
2386        created.
2387        """
2388
2389        if not header_name and not output_file:
2390            output_file = self.get_output()
2391
2392        if self.get_header():
2393
2394            # Get header object
2395            header_obj = self.get_header()
2396
2397            # Create database
2398            db_for_header = Database(database=self.get_input())
2399
2400            # Get real columns in the file
2401            db_header_columns = db_for_header.get_columns()
2402
2403            with tempfile.TemporaryDirectory() as tmpdir:
2404
2405                # Write header file
2406                header_file_tmp = os.path.join(tmpdir, "header")
2407                f = open(header_file_tmp, "w")
2408                vcf.Writer(f, header_obj)
2409                f.close()
2410
2411                # Replace #CHROM line with rel columns
2412                header_list = db_for_header.read_header_file(
2413                    header_file=header_file_tmp
2414                )
2415                header_list[-1] = "\t".join(db_header_columns)
2416
2417                # Remove CHROM line
2418                if remove_chrom_line:
2419                    header_list.pop()
2420
2421                # Clean header
2422                if clean_header:
2423                    header_list_clean = []
2424                    for head in header_list:
2425                        # Clean head for malformed header
2426                        head_clean = head
2427                        head_clean = re.subn(
2428                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2429                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2430                            head_clean,
2431                            2,
2432                        )[0]
2433                        # Write header
2434                        header_list_clean.append(head_clean)
2435                    header_list = header_list_clean
2436
2437            tmp_header_name = output_file + output_file_ext
2438
2439            f = open(tmp_header_name, "w")
2440            for line in header_list:
2441                f.write(line)
2442            f.close()
2443
2444        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. If clean_header, defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it will be kept. Defaults to False
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2446    def export_variant_vcf(
2447        self,
2448        vcf_file,
2449        remove_info: bool = False,
2450        add_samples: bool = True,
2451        list_samples: list = [],
2452        where_clause: str = "",
2453        index: bool = False,
2454        threads: int | None = None,
2455    ) -> bool | None:
2456        """
2457        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2458        remove INFO field, add samples, and control compression and indexing.
2459
2460        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2461        written to. It is the output file that will contain the filtered VCF data based on the specified
2462        parameters
2463        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2464        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2465        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2466        in, defaults to False
2467        :type remove_info: bool (optional)
2468        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2469        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2470        If set to False, the samples will be removed. The default value is True, defaults to True
2471        :type add_samples: bool (optional)
2472        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2473        in the output VCF file. By default, all samples will be included. If you provide a list of
2474        samples, only those samples will be included in the output file
2475        :type list_samples: list
2476        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2477        determines whether or not to create an index for the output VCF file. If `index` is set to
2478        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2479        :type index: bool (optional)
2480        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2481        number of threads to use for exporting the VCF file. It determines how many parallel threads
2482        will be used during the export process. More threads can potentially speed up the export process
2483        by utilizing multiple cores of the processor. If
2484        :type threads: int | None
2485        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2486        method with various parameters including the output file, query, threads, sort flag, and index
2487        flag. The `export_output` method is responsible for exporting the VCF data based on the
2488        specified parameters and configurations provided in the `export_variant_vcf` function.
2489        """
2490
2491        # Config
2492        config = self.get_config()
2493
2494        # Extract VCF
2495        log.debug("Export VCF...")
2496
2497        # Table variants
2498        table_variants = self.get_table_variants()
2499
2500        # Threads
2501        if not threads:
2502            threads = self.get_threads()
2503
2504        # Info fields
2505        if remove_info:
2506            if not isinstance(remove_info, str):
2507                remove_info = "."
2508            info_field = f"""'{remove_info}' as INFO"""
2509        else:
2510            info_field = "INFO"
2511
2512        # Samples fields
2513        if add_samples:
2514            if not list_samples:
2515                list_samples = self.get_header_sample_list()
2516            if list_samples:
2517                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2518            else:
2519                samples_fields = ""
2520            log.debug(f"samples_fields: {samples_fields}")
2521        else:
2522            samples_fields = ""
2523
2524        # Where clause
2525        if where_clause is None:
2526            where_clause = ""
2527
2528        # Variants
2529        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2530        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2531        log.debug(f"sql_query_select={sql_query_select}")
2532
2533        return self.export_output(
2534            output_file=vcf_file,
2535            output_header=None,
2536            export_header=True,
2537            query=sql_query_select,
2538            parquet_partitions=None,
2539            chunk_size=config.get("chunk_size", None),
2540            threads=threads,
2541            sort=True,
2542            index=index,
2543            order_by=None,
2544        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = None, threads: int = 1) -> None:
    """
    It takes a list of commands and runs them in parallel using the number of
    threads specified.

    :param commands: A list of commands to run (defaults to an empty list)
    :param threads: The number of threads to use, defaults to 1 (optional)
    """

    # Avoid a shared mutable default argument; None means "no commands"
    if commands is None:
        commands = []

    run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
    """
    Return the number of threads to use for the current job.

    The value is read from the parameters first, then from the
    configuration. A missing/empty value falls back to ``default``; a
    zero or negative configured value means "use every available core".

    :param default: number of threads to use when none is configured,
        defaults to 1
    :type default: int (optional)
    :return: the number of threads to use for the current job
    """

    # Parameters take precedence over configuration
    configured = self.get_param().get(
        "threads", self.get_config().get("threads", None)
    )

    # Nothing configured (None, empty string, 0): use the default
    if not configured:
        return default

    # Non-positive values request all available CPU cores
    nb_threads = int(configured)
    return os.cpu_count() if nb_threads <= 0 else nb_threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
    """
    Return the memory setting from parameters or configuration.

    Parameters take precedence over configuration; when neither defines
    a "memory" entry (or it is empty), ``default`` is returned.

    :param default: fallback value when no memory setting is found
    :type default: str
    :return: the memory setting as a string, or ``default``
    """

    # Parameters take precedence over configuration
    memory_setting = self.get_param().get(
        "memory", self.get_config().get("memory", None)
    )

    return memory_setting if memory_setting else default

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes in a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function returns this default value
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
    """
    Update the variants table from a VCF file, dispatching to the backend
    matching the current connexion format (duckdb or sqlite).

    :param vcf_file: the path to the VCF file
    """

    backend = self.get_connexion_format()

    if backend == "duckdb":
        self.update_from_vcf_duckdb(vcf_file)
    elif backend == "sqlite":
        self.update_from_vcf_sqlite(vcf_file)

If the database connexion format is duckdb, then use the duckdb method; if it is sqlite, use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    """
    It takes a VCF file and updates the INFO column of the variants table in
    the database with the INFO column of the VCF file.

    Matching rows (same #CHROM/POS/REF/ALT) get the incoming INFO appended,
    separated by ';' when both sides are non-empty.

    :param vcf_file: the path to the VCF file
    """

    # variants table
    table_variants = self.get_table_variants()

    # Loading VCF into a DataFrame: skip the meta-information header lines
    # so that the '#CHROM' line provides the column names.
    # NOTE: the local name 'vcf_df' is referenced directly in the SQL below
    # (DuckDB scans local pandas DataFrames by variable name) — do not rename.
    skip = self.get_header_length(file=vcf_file)
    vcf_df = pd.read_csv(
        vcf_file,
        sep="\t",
        engine="c",
        skiprows=skip,
        header=0,
        low_memory=False,
    )
    # Append the incoming INFO to each matching variant's INFO; empty and
    # '.' INFO values are treated as "no annotation" on both sides.
    sql_query_update = f"""
    UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                            SELECT 
                                concat(
                                    CASE
                                        WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END
                                    ,
                                    CASE
                                        WHEN table_parquet.INFO NOT IN ('','.')
                                        THEN table_parquet.INFO
                                        ELSE ''
                                    END
                                )
                            FROM vcf_df as table_parquet
                                    WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                    AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
    self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    """
    It creates a temporary table in the SQLite database, loads the VCF file
    into the temporary table, then updates the INFO column of the variants
    table with the INFO column of the temporary table.

    :param vcf_file: The path to the VCF file you want to update the database with
    """

    # Create a temporary table for the VCF, cloning the variants schema
    # (WHERE 0 copies the structure with no rows)
    table_vcf = "tmp_vcf"
    sql_create = (
        f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
    )
    self.conn.execute(sql_create)

    # Loading VCF into the temporary table; '#'-prefixed header lines are
    # skipped, so the standard VCF column names are assigned explicitly
    vcf_df = pd.read_csv(
        vcf_file, sep="\t", comment="#", header=None, low_memory=False
    )
    vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

    # Update table 'variants' with VCF data
    # warning: CONCAT as || operator
    # NOTE(review): when no matching row exists in tmp_vcf, the correlated
    # subquery yields NULL and '||' propagates NULL into INFO in SQLite —
    # confirm this is the intended behavior.
    sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT 
                        CASE 
                            WHEN table_variants.INFO NOT IN ('','.') 
                                AND table_vcf.INFO NOT IN ('','.')  
                            THEN ';' 
                            ELSE '' 
                        END || 
                        CASE 
                            WHEN table_vcf.INFO NOT IN ('','.') 
                            THEN table_vcf.INFO 
                            ELSE '' 
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
    """
    self.conn.execute(sql_query_update)

    # Drop temporary table
    sql_drop = f"DROP TABLE {table_vcf}"
    self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
    """
    Drop the variants table if it exists.
    """

    self.conn.execute(f"DROP TABLE IF EXISTS {self.get_table_variants()}")

This function drops the variants table

def set_variant_id(
    self, variant_id_column: str = "variant_id", force: bool = None
) -> str:
    """
    It adds a column to the variants table called `variant_id` and populates
    it with a hash of the assembly, `#CHROM`, `POS`, `REF` and `ALT` columns.

    :param variant_id_column: The name of the column to be created in the variants table, defaults
    to variant_id
    :type variant_id_column: str (optional)
    :param force: If True, the variant_id column will be created even if it already exists
    :type force: bool
    :return: The name of the column that contains the variant_id
    """

    # Assembly (param > config > default)
    assembly = self.get_param().get(
        "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
    )

    # INFO/Tag prefix
    prefix = self.get_explode_infos_prefix()

    # Explode INFO/SVTYPE into a dedicated column (removed again below)
    added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

    # variants table
    table_variants = self.get_table_variants()

    # variant_id column: fall back to the default name
    if not variant_id_column:
        variant_id_column = "variant_id"

    # Create variant_id column
    if "variant_id" not in self.get_extra_infos() or force:

        # Create column
        self.add_column(
            table_name=table_variants,
            column_name=variant_id_column,
            column_type="UBIGINT",
            default_value="0",
        )

        # Update column
        # NOTE(review): the last hash() argument is the quoted literal
        # '"{prefix}SVTYPE"' (a constant string after interpolation), not
        # the value of the exploded SVTYPE column — confirm this is intended.
        self.conn.execute(
            f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
            """
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)

    # return variant_id column name
    return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(
    self, variant_id_column: str = "variant_id", force: bool = None
) -> str:
    """
    Return the name of the variant_id column, creating and populating it
    via set_variant_id when needed.

    :param variant_id_column: name of the column holding variant IDs,
        defaults to variant_id
    :type variant_id_column: str (optional)
    :param force: forwarded to set_variant_id; when True the column is
        (re)created even if it already exists
    :type force: bool
    :return: the variant_id column name
    """

    return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns

The variant_id column name.

def scan_databases(
    self,
    database_formats: list = None,
    database_releases: list = None,
) -> dict:
    """
    The function `scan_databases` scans for available databases based on
    specified formats and releases.

    :param database_formats: The list of database formats to scan for
        (e.g. "parquet"), defaults to ["parquet"]
    :type database_formats: list (optional)
    :param database_releases: The list of database releases to scan for,
        defaults to ["current"]
    :type database_releases: list (optional)
    :return: A dictionary containing information about databases that match
        the specified formats and releases.
    """

    # Avoid shared mutable default arguments; fill defaults lazily
    if database_formats is None:
        database_formats = ["parquet"]
    if database_releases is None:
        database_releases = ["current"]

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Param - Assembly (param > config > default)
    assembly = param.get("assembly", config.get("assembly", None))
    if not assembly:
        assembly = DEFAULT_ASSEMBLY
        log.warning(f"Default assembly '{assembly}'")

    # Scan for available databases
    log.info(
        f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
    )
    databases_infos_dict = databases_infos(
        database_folder_releases=database_releases,
        database_formats=database_formats,
        assembly=assembly,
        config=config,
    )
    log.info(
        f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
    )

    return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for database_releases is set to ["current"], meaning that by default, the function will scan databases that are in the "current" release
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
2886    def annotation(self) -> None:
2887        """
2888        It annotates the VCF file with the annotations specified in the config file.
2889        """
2890
2891        # Config
2892        config = self.get_config()
2893
2894        # Param
2895        param = self.get_param()
2896
2897        # Param - Assembly
2898        assembly = param.get("assembly", config.get("assembly", None))
2899        if not assembly:
2900            assembly = DEFAULT_ASSEMBLY
2901            log.warning(f"Default assembly '{assembly}'")
2902
2903        # annotations databases folders
2904        annotations_databases = set(
2905            config.get("folders", {})
2906            .get("databases", {})
2907            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2908            + config.get("folders", {})
2909            .get("databases", {})
2910            .get("parquet", ["~/howard/databases/parquet/current"])
2911            + config.get("folders", {})
2912            .get("databases", {})
2913            .get("bcftools", ["~/howard/databases/bcftools/current"])
2914        )
2915
2916        # Get param annotations
2917        if param.get("annotations", None) and isinstance(
2918            param.get("annotations", None), str
2919        ):
2920            log.debug(param.get("annotations", None))
2921            param_annotation_list = param.get("annotations").split(",")
2922        else:
2923            param_annotation_list = []
2924
2925        # Each tools param
2926        if param.get("annotation_parquet", None) != None:
2927            log.debug(
2928                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2929            )
2930            if isinstance(param.get("annotation_parquet", None), list):
2931                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2932            else:
2933                param_annotation_list.append(param.get("annotation_parquet"))
2934        if param.get("annotation_snpsift", None) != None:
2935            if isinstance(param.get("annotation_snpsift", None), list):
2936                param_annotation_list.append(
2937                    "snpsift:"
2938                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2939                )
2940            else:
2941                param_annotation_list.append(
2942                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2943                )
2944        if param.get("annotation_snpeff", None) != None:
2945            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2946        if param.get("annotation_bcftools", None) != None:
2947            if isinstance(param.get("annotation_bcftools", None), list):
2948                param_annotation_list.append(
2949                    "bcftools:"
2950                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2951                )
2952            else:
2953                param_annotation_list.append(
2954                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2955                )
2956        if param.get("annotation_annovar", None) != None:
2957            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2958        if param.get("annotation_exomiser", None) != None:
2959            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2960        if param.get("annotation_splice", None) != None:
2961            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2962
2963        # Merge param annotations list
2964        param["annotations"] = ",".join(param_annotation_list)
2965
2966        # debug
2967        log.debug(f"param_annotations={param['annotations']}")
2968
2969        if param.get("annotations"):
2970
2971            # Log
2972            # log.info("Annotations - Check annotation parameters")
2973
2974            if not "annotation" in param:
2975                param["annotation"] = {}
2976
2977            # List of annotations parameters
2978            annotations_list_input = {}
2979            if isinstance(param.get("annotations", None), str):
2980                annotation_file_list = [
2981                    value for value in param.get("annotations", "").split(",")
2982                ]
2983                for annotation_file in annotation_file_list:
2984                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
2985            else:
2986                annotations_list_input = param.get("annotations", {})
2987
2988            log.info(f"Quick Annotations:")
2989            for annotation_key in list(annotations_list_input.keys()):
2990                log.info(f"   {annotation_key}")
2991
2992            # List of annotations and associated fields
2993            annotations_list = {}
2994
2995            for annotation_file in annotations_list_input:
2996
2997                # Explode annotations if ALL
2998                if (
2999                    annotation_file.upper() == "ALL"
3000                    or annotation_file.upper().startswith("ALL:")
3001                ):
3002
3003                    # check ALL parameters (formats, releases)
3004                    annotation_file_split = annotation_file.split(":")
3005                    database_formats = "parquet"
3006                    database_releases = "current"
3007                    for annotation_file_option in annotation_file_split[1:]:
3008                        database_all_options_split = annotation_file_option.split("=")
3009                        if database_all_options_split[0] == "format":
3010                            database_formats = database_all_options_split[1].split("+")
3011                        if database_all_options_split[0] == "release":
3012                            database_releases = database_all_options_split[1].split("+")
3013
3014                    # Scan for availabled databases
3015                    databases_infos_dict = self.scan_databases(
3016                        database_formats=database_formats,
3017                        database_releases=database_releases,
3018                    )
3019
3020                    # Add found databases in annotation parameters
3021                    for database_infos in databases_infos_dict.keys():
3022                        annotations_list[database_infos] = {"INFO": None}
3023
3024                else:
3025                    annotations_list[annotation_file] = annotations_list_input[
3026                        annotation_file
3027                    ]
3028
3029            # Check each databases
3030            if len(annotations_list):
3031
3032                log.info(
3033                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
3034                )
3035
3036                for annotation_file in annotations_list:
3037
3038                    # Init
3039                    annotations = annotations_list.get(annotation_file, None)
3040
3041                    # Annotation snpEff
3042                    if annotation_file.startswith("snpeff"):
3043
3044                        log.debug(f"Quick Annotation snpEff")
3045
3046                        if "snpeff" not in param["annotation"]:
3047                            param["annotation"]["snpeff"] = {}
3048
3049                        if "options" not in param["annotation"]["snpeff"]:
3050                            param["annotation"]["snpeff"]["options"] = ""
3051
3052                        # snpEff options in annotations
3053                        param["annotation"]["snpeff"]["options"] = "".join(
3054                            annotation_file.split(":")[1:]
3055                        )
3056
3057                    # Annotation Annovar
3058                    elif annotation_file.startswith("annovar"):
3059
3060                        log.debug(f"Quick Annotation Annovar")
3061
3062                        if "annovar" not in param["annotation"]:
3063                            param["annotation"]["annovar"] = {}
3064
3065                        if "annotations" not in param["annotation"]["annovar"]:
3066                            param["annotation"]["annovar"]["annotations"] = {}
3067
3068                        # Options
3069                        annotation_file_split = annotation_file.split(":")
3070                        for annotation_file_annotation in annotation_file_split[1:]:
3071                            if annotation_file_annotation:
3072                                param["annotation"]["annovar"]["annotations"][
3073                                    annotation_file_annotation
3074                                ] = annotations
3075
3076                    # Annotation Exomiser
3077                    elif annotation_file.startswith("exomiser"):
3078
3079                        log.debug(f"Quick Annotation Exomiser")
3080
3081                        param["annotation"]["exomiser"] = params_string_to_dict(
3082                            annotation_file
3083                        )
3084
3085                    # Annotation Splice
3086                    elif annotation_file.startswith("splice"):
3087
3088                        log.debug(f"Quick Annotation Splice")
3089
3090                        param["annotation"]["splice"] = params_string_to_dict(
3091                            annotation_file
3092                        )
3093
3094                    # Annotation Parquet or BCFTOOLS
3095                    else:
3096
3097                        # Tools detection
3098                        if annotation_file.startswith("bcftools:"):
3099                            annotation_tool_initial = "bcftools"
3100                            annotation_file = ":".join(annotation_file.split(":")[1:])
3101                        elif annotation_file.startswith("snpsift:"):
3102                            annotation_tool_initial = "snpsift"
3103                            annotation_file = ":".join(annotation_file.split(":")[1:])
3104                        elif annotation_file.startswith("bigwig:"):
3105                            annotation_tool_initial = "bigwig"
3106                            annotation_file = ":".join(annotation_file.split(":")[1:])
3107                        else:
3108                            annotation_tool_initial = None
3109
3110                        # list of files
3111                        annotation_file_list = annotation_file.replace("+", ":").split(
3112                            ":"
3113                        )
3114
3115                        for annotation_file in annotation_file_list:
3116
3117                            if annotation_file:
3118
3119                                # Annotation tool initial
3120                                annotation_tool = annotation_tool_initial
3121
3122                                # Find file
3123                                annotation_file_found = None
3124
3125                                if os.path.exists(annotation_file):
3126                                    annotation_file_found = annotation_file
3127                                elif os.path.exists(full_path(annotation_file)):
3128                                    annotation_file_found = full_path(annotation_file)
3129                                else:
3130                                    # Find within assembly folders
3131                                    for annotations_database in annotations_databases:
3132                                        found_files = find_all(
3133                                            annotation_file,
3134                                            os.path.join(
3135                                                annotations_database, assembly
3136                                            ),
3137                                        )
3138                                        if len(found_files) > 0:
3139                                            annotation_file_found = found_files[0]
3140                                            break
3141                                    if not annotation_file_found and not assembly:
3142                                        # Find within folders
3143                                        for (
3144                                            annotations_database
3145                                        ) in annotations_databases:
3146                                            found_files = find_all(
3147                                                annotation_file, annotations_database
3148                                            )
3149                                            if len(found_files) > 0:
3150                                                annotation_file_found = found_files[0]
3151                                                break
3152                                log.debug(
3153                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
3154                                )
3155
3156                                # Full path
3157                                annotation_file_found = full_path(annotation_file_found)
3158
3159                                if annotation_file_found:
3160
3161                                    database = Database(database=annotation_file_found)
3162                                    quick_annotation_format = database.get_format()
3163                                    quick_annotation_is_compressed = (
3164                                        database.is_compressed()
3165                                    )
3166                                    quick_annotation_is_indexed = os.path.exists(
3167                                        f"{annotation_file_found}.tbi"
3168                                    )
3169                                    bcftools_preference = False
3170
3171                                    # Check Annotation Tool
3172                                    if not annotation_tool:
3173                                        if (
3174                                            bcftools_preference
3175                                            and quick_annotation_format
3176                                            in ["vcf", "bed"]
3177                                            and quick_annotation_is_compressed
3178                                            and quick_annotation_is_indexed
3179                                        ):
3180                                            annotation_tool = "bcftools"
3181                                        elif quick_annotation_format in [
3182                                            "vcf",
3183                                            "bed",
3184                                            "tsv",
3185                                            "tsv",
3186                                            "csv",
3187                                            "json",
3188                                            "tbl",
3189                                            "parquet",
3190                                            "duckdb",
3191                                        ]:
3192                                            annotation_tool = "parquet"
3193                                        elif quick_annotation_format in ["bw"]:
3194                                            annotation_tool = "bigwig"
3195                                        else:
3196                                            log.error(
3197                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3198                                            )
3199                                            raise ValueError(
3200                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3201                                            )
3202
3203                                    log.debug(
3204                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3205                                    )
3206
3207                                    # Annotation Tool dispatch
3208                                    if annotation_tool:
3209                                        if annotation_tool not in param["annotation"]:
3210                                            param["annotation"][annotation_tool] = {}
3211                                        if (
3212                                            "annotations"
3213                                            not in param["annotation"][annotation_tool]
3214                                        ):
3215                                            param["annotation"][annotation_tool][
3216                                                "annotations"
3217                                            ] = {}
3218                                        param["annotation"][annotation_tool][
3219                                            "annotations"
3220                                        ][annotation_file_found] = annotations
3221
3222                                else:
3223                                    log.warning(
3224                                        f"Quick Annotation File {annotation_file} does NOT exist"
3225                                    )
3226
3227                self.set_param(param)
3228
3229        if param.get("annotation", None):
3230            log.info("Annotations")
3231            if param.get("annotation", {}).get("parquet", None):
3232                log.info("Annotations 'parquet'...")
3233                self.annotation_parquet()
3234            if param.get("annotation", {}).get("bcftools", None):
3235                log.info("Annotations 'bcftools'...")
3236                self.annotation_bcftools()
3237            if param.get("annotation", {}).get("snpsift", None):
3238                log.info("Annotations 'snpsift'...")
3239                self.annotation_snpsift()
3240            if param.get("annotation", {}).get("bigwig", None):
3241                log.info("Annotations 'bigwig'...")
3242                self.annotation_bigwig()
3243            if param.get("annotation", {}).get("annovar", None):
3244                log.info("Annotations 'annovar'...")
3245                self.annotation_annovar()
3246            if param.get("annotation", {}).get("snpeff", None):
3247                log.info("Annotations 'snpeff'...")
3248                self.annotation_snpeff()
3249            if param.get("annotation", {}).get("exomiser", None) is not None:
3250                log.info("Annotations 'exomiser'...")
3251                self.annotation_exomiser()
3252            if param.get("annotation", {}).get("splice", None) is not None:
3253                log.info("Annotations 'splice' ...")
3254                self.annotation_splice()
3255
3256        # Explode INFOS fields into table fields
3257        if self.get_explode_infos():
3258            self.explode_infos(
3259                prefix=self.get_explode_infos_prefix(),
3260                fields=self.get_explode_infos_fields(),
3261                force=True,
3262            )

The preceding method annotates the VCF file with the annotations specified in the configuration file.

def annotation_bigwig(self, threads: int = None) -> None:
3264    def annotation_bigwig(self, threads: int = None) -> None:
3265        """
3266        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
3267
3268        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
3269        number of threads to be used for parallel processing during the annotation process. If the
3270        `threads` parameter is not provided, the method will attempt to determine the optimal number of
3271        threads to use based on the system configuration
3272        :type threads: int
3273        :return: True
3274        """
3275
3276        # DEBUG
3277        log.debug("Start annotation with bigwig databases")
3278
3279        # # Threads
3280        # if not threads:
3281        #     threads = self.get_threads()
3282        # log.debug("Threads: " + str(threads))
3283
3284        # Config
3285        config = self.get_config()
3286        log.debug("Config: " + str(config))
3287
3288        # Config - BCFTools databases folders
3289        databases_folders = set(
3290            self.get_config()
3291            .get("folders", {})
3292            .get("databases", {})
3293            .get("annotations", ["."])
3294            + self.get_config()
3295            .get("folders", {})
3296            .get("databases", {})
3297            .get("bigwig", ["."])
3298        )
3299        log.debug("Databases annotations: " + str(databases_folders))
3300
3301        # Param
3302        annotations = (
3303            self.get_param()
3304            .get("annotation", {})
3305            .get("bigwig", {})
3306            .get("annotations", None)
3307        )
3308        log.debug("Annotations: " + str(annotations))
3309
3310        # Assembly
3311        assembly = self.get_param().get(
3312            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3313        )
3314
3315        # Data
3316        table_variants = self.get_table_variants()
3317
3318        # Check if not empty
3319        log.debug("Check if not empty")
3320        sql_query_chromosomes = (
3321            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3322        )
3323        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3324        if not sql_query_chromosomes_df["count"][0]:
3325            log.info(f"VCF empty")
3326            return
3327
3328        # VCF header
3329        vcf_reader = self.get_header()
3330        log.debug("Initial header: " + str(vcf_reader.infos))
3331
3332        # Existing annotations
3333        for vcf_annotation in self.get_header().infos:
3334
3335            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3336            log.debug(
3337                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3338            )
3339
3340        if annotations:
3341
3342            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3343
3344                # Export VCF file
3345                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3346
3347                # annotation_bigwig_config
3348                annotation_bigwig_config_list = []
3349
3350                for annotation in annotations:
3351                    annotation_fields = annotations[annotation]
3352
3353                    # Annotation Name
3354                    annotation_name = os.path.basename(annotation)
3355
3356                    if not annotation_fields:
3357                        annotation_fields = {"INFO": None}
3358
3359                    log.debug(f"Annotation '{annotation_name}'")
3360                    log.debug(
3361                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3362                    )
3363
3364                    # Create Database
3365                    database = Database(
3366                        database=annotation,
3367                        databases_folders=databases_folders,
3368                        assembly=assembly,
3369                    )
3370
3371                    # Find files
3372                    db_file = database.get_database()
3373                    db_file = full_path(db_file)
3374                    db_hdr_file = database.get_header_file()
3375                    db_hdr_file = full_path(db_hdr_file)
3376                    db_file_type = database.get_format()
3377
3378                    # If db_file is http ?
3379                    if database.get_database().startswith("http"):
3380
3381                        # Datbase is HTTP URL
3382                        db_file_is_http = True
3383
3384                        # DB file keep as URL
3385                        db_file = database.get_database()
3386                        log.warning(
3387                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
3388                        )
3389
3390                        # Retrieve automatic annotation field name
3391                        annotation_field = clean_annotation_field(
3392                            os.path.basename(db_file).replace(".bw", "")
3393                        )
3394                        log.debug(
3395                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
3396                        )
3397
3398                        # Create automatic header file
3399                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
3400                        with open(db_hdr_file, "w") as f:
3401                            f.write("##fileformat=VCFv4.2\n")
3402                            f.write(
3403                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
3404                            )
3405                            f.write(f"#CHROM	START	END	{annotation_field}\n")
3406
3407                    else:
3408
3409                        # Datbase is NOT HTTP URL
3410                        db_file_is_http = False
3411
3412                    # Check index - try to create if not exists
3413                    if (
3414                        db_file is None
3415                        or db_hdr_file is None
3416                        or (not os.path.exists(db_file) and not db_file_is_http)
3417                        or not os.path.exists(db_hdr_file)
3418                        or not db_file_type in ["bw"]
3419                    ):
3420                        # if False:
3421                        log.error("Annotation failed: database not valid")
3422                        log.error(f"Annotation annotation file: {db_file}")
3423                        log.error(f"Annotation annotation file type: {db_file_type}")
3424                        log.error(f"Annotation annotation header: {db_hdr_file}")
3425                        raise ValueError(
3426                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
3427                        )
3428                    else:
3429
3430                        # Log
3431                        log.debug(
3432                            f"Annotation '{annotation}' - file: "
3433                            + str(db_file)
3434                            + " and "
3435                            + str(db_hdr_file)
3436                        )
3437
3438                        # Load header as VCF object
3439                        db_hdr_vcf = Variants(input=db_hdr_file)
3440                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3441                        log.debug(
3442                            "Annotation database header: "
3443                            + str(db_hdr_vcf_header_infos)
3444                        )
3445
3446                        # For all fields in database
3447                        annotation_fields_full = False
3448                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3449                            annotation_fields = {
3450                                key: key for key in db_hdr_vcf_header_infos
3451                            }
3452                            log.debug(
3453                                "Annotation database header - All annotations added: "
3454                                + str(annotation_fields)
3455                            )
3456                            annotation_fields_full = True
3457
3458                        # Init
3459                        cyvcf2_header_rename_dict = {}
3460                        cyvcf2_header_list = []
3461                        cyvcf2_header_indexes = {}
3462
3463                        # process annotation fields
3464                        for annotation_field in annotation_fields:
3465
3466                            # New annotation name
3467                            annotation_field_new = annotation_fields[annotation_field]
3468
3469                            # Check annotation field and index in header
3470                            if (
3471                                annotation_field
3472                                in db_hdr_vcf.get_header_columns_as_list()
3473                            ):
3474                                annotation_field_index = (
3475                                    db_hdr_vcf.get_header_columns_as_list().index(
3476                                        annotation_field
3477                                    )
3478                                    - 3
3479                                )
3480                                cyvcf2_header_indexes[annotation_field_new] = (
3481                                    annotation_field_index
3482                                )
3483                            else:
3484                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
3485                                log.error(msg_err)
3486                                raise ValueError(msg_err)
3487
3488                            # Append annotation field in cyvcf2 header list
3489                            cyvcf2_header_rename_dict[annotation_field_new] = (
3490                                db_hdr_vcf_header_infos[annotation_field].id
3491                            )
3492                            cyvcf2_header_list.append(
3493                                {
3494                                    "ID": annotation_field_new,
3495                                    "Number": db_hdr_vcf_header_infos[
3496                                        annotation_field
3497                                    ].num,
3498                                    "Type": db_hdr_vcf_header_infos[
3499                                        annotation_field
3500                                    ].type,
3501                                    "Description": db_hdr_vcf_header_infos[
3502                                        annotation_field
3503                                    ].desc,
3504                                }
3505                            )
3506
3507                            # Add header on VCF
3508                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
3509                                annotation_field_new,
3510                                db_hdr_vcf_header_infos[annotation_field].num,
3511                                db_hdr_vcf_header_infos[annotation_field].type,
3512                                db_hdr_vcf_header_infos[annotation_field].desc,
3513                                "HOWARD BigWig annotation",
3514                                "unknown",
3515                                self.code_type_map[
3516                                    db_hdr_vcf_header_infos[annotation_field].type
3517                                ],
3518                            )
3519
3520                        # Load bigwig database
3521                        bw_db = pyBigWig.open(db_file)
3522                        if bw_db.isBigWig():
3523                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
3524                        else:
3525                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
3526                            log.error(msg_err)
3527                            raise ValueError(msg_err)
3528
3529                        annotation_bigwig_config_list.append(
3530                            {
3531                                "db_file": db_file,
3532                                "bw_db": bw_db,
3533                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
3534                                "cyvcf2_header_list": cyvcf2_header_list,
3535                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
3536                            }
3537                        )
3538
3539                # Annotate
3540                if annotation_bigwig_config_list:
3541
3542                    # Annotation config
3543                    log.debug(
3544                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
3545                    )
3546
3547                    # Export VCF file
3548                    self.export_variant_vcf(
3549                        vcf_file=tmp_vcf_name,
3550                        remove_info=True,
3551                        add_samples=False,
3552                        index=True,
3553                    )
3554
3555                    # Load input tmp file
3556                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
3557
3558                    # Add header in input file
3559                    for annotation_bigwig_config in annotation_bigwig_config_list:
3560                        for cyvcf2_header_field in annotation_bigwig_config.get(
3561                            "cyvcf2_header_list", []
3562                        ):
3563                            log.info(
3564                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
3565                            )
3566                            input_vcf.add_info_to_header(cyvcf2_header_field)
3567
3568                    # Create output VCF file
3569                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
3570                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
3571
3572                    # Fetch variants
3573                    log.info(f"Annotations 'bigwig' start...")
3574                    for variant in input_vcf:
3575
3576                        for annotation_bigwig_config in annotation_bigwig_config_list:
3577
3578                            # DB and indexes
3579                            bw_db = annotation_bigwig_config.get("bw_db", None)
3580                            cyvcf2_header_indexes = annotation_bigwig_config.get(
3581                                "cyvcf2_header_indexes", None
3582                            )
3583
3584                            # Retrieve value from chrom pos
3585                            res = bw_db.values(
3586                                variant.CHROM, variant.POS - 1, variant.POS
3587                            )
3588
3589                            # For each annotation fields (and indexes)
3590                            for cyvcf2_header_index in cyvcf2_header_indexes:
3591
3592                                # If value is NOT nNone
3593                                if not np.isnan(
3594                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
3595                                ):
3596                                    variant.INFO[cyvcf2_header_index] = res[
3597                                        cyvcf2_header_indexes[cyvcf2_header_index]
3598                                    ]
3599
3600                        # Add record in output file
3601                        output_vcf.write_record(variant)
3602
3603                    # Log
3604                    log.debug(f"Annotation done.")
3605
3606                    # Close and write file
3607                    log.info(f"Annotations 'bigwig' write...")
3608                    output_vcf.close()
3609                    log.debug(f"Write done.")
3610
3611                    # Update variants
3612                    log.info(f"Annotations 'bigwig' update...")
3613                    self.update_from_vcf(output_vcf_file)
3614                    log.debug(f"Update done.")
3615
3616        return True

The function annotation_bigwig annotates variants in a VCF file using bigwig databases.

Parameters
  • threads: number of threads to use for parallel processing during annotation; if not provided, the method determines an optimal number of threads from the system configuration
Returns

True

def annotation_snpsift(self, threads: int = None) -> None:
3618    def annotation_snpsift(self, threads: int = None) -> None:
3619        """
3620        This function annotate with bcftools
3621
3622        :param threads: Number of threads to use
3623        :return: the value of the variable "return_value".
3624        """
3625
3626        # DEBUG
3627        log.debug("Start annotation with bcftools databases")
3628
3629        # Threads
3630        if not threads:
3631            threads = self.get_threads()
3632        log.debug("Threads: " + str(threads))
3633
3634        # Config
3635        config = self.get_config()
3636        log.debug("Config: " + str(config))
3637
3638        # Config - snpSift
3639        snpsift_bin_command = get_bin_command(
3640            bin="SnpSift.jar",
3641            tool="snpsift",
3642            bin_type="jar",
3643            config=config,
3644            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3645        )
3646        if not snpsift_bin_command:
3647            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3648            log.error(msg_err)
3649            raise ValueError(msg_err)
3650
3651        # Config - bcftools
3652        bcftools_bin_command = get_bin_command(
3653            bin="bcftools",
3654            tool="bcftools",
3655            bin_type="bin",
3656            config=config,
3657            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3658        )
3659        if not bcftools_bin_command:
3660            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3661            log.error(msg_err)
3662            raise ValueError(msg_err)
3663
3664        # Config - BCFTools databases folders
3665        databases_folders = set(
3666            self.get_config()
3667            .get("folders", {})
3668            .get("databases", {})
3669            .get("annotations", ["."])
3670            + self.get_config()
3671            .get("folders", {})
3672            .get("databases", {})
3673            .get("bcftools", ["."])
3674        )
3675        log.debug("Databases annotations: " + str(databases_folders))
3676
3677        # Param
3678        annotations = (
3679            self.get_param()
3680            .get("annotation", {})
3681            .get("snpsift", {})
3682            .get("annotations", None)
3683        )
3684        log.debug("Annotations: " + str(annotations))
3685
3686        # Assembly
3687        assembly = self.get_param().get(
3688            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3689        )
3690
3691        # Data
3692        table_variants = self.get_table_variants()
3693
3694        # Check if not empty
3695        log.debug("Check if not empty")
3696        sql_query_chromosomes = (
3697            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3698        )
3699        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3700        if not sql_query_chromosomes_df["count"][0]:
3701            log.info(f"VCF empty")
3702            return
3703
3704        # VCF header
3705        vcf_reader = self.get_header()
3706        log.debug("Initial header: " + str(vcf_reader.infos))
3707
3708        # Existing annotations
3709        for vcf_annotation in self.get_header().infos:
3710
3711            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3712            log.debug(
3713                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3714            )
3715
3716        if annotations:
3717
3718            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3719
3720                # Export VCF file
3721                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3722
3723                # Init
3724                commands = {}
3725
3726                for annotation in annotations:
3727                    annotation_fields = annotations[annotation]
3728
3729                    # Annotation Name
3730                    annotation_name = os.path.basename(annotation)
3731
3732                    if not annotation_fields:
3733                        annotation_fields = {"INFO": None}
3734
3735                    log.debug(f"Annotation '{annotation_name}'")
3736                    log.debug(
3737                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3738                    )
3739
3740                    # Create Database
3741                    database = Database(
3742                        database=annotation,
3743                        databases_folders=databases_folders,
3744                        assembly=assembly,
3745                    )
3746
3747                    # Find files
3748                    db_file = database.get_database()
3749                    db_file = full_path(db_file)
3750                    db_hdr_file = database.get_header_file()
3751                    db_hdr_file = full_path(db_hdr_file)
3752                    db_file_type = database.get_format()
3753                    db_tbi_file = f"{db_file}.tbi"
3754                    db_file_compressed = database.is_compressed()
3755
3756                    # Check if compressed
3757                    if not db_file_compressed:
3758                        log.error(
3759                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3760                        )
3761                        raise ValueError(
3762                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3763                        )
3764
3765                    # Check if indexed
3766                    if not os.path.exists(db_tbi_file):
3767                        log.error(
3768                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3769                        )
3770                        raise ValueError(
3771                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3772                        )
3773
3774                    # Check index - try to create if not exists
3775                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3776                        log.error("Annotation failed: database not valid")
3777                        log.error(f"Annotation annotation file: {db_file}")
3778                        log.error(f"Annotation annotation header: {db_hdr_file}")
3779                        log.error(f"Annotation annotation index: {db_tbi_file}")
3780                        raise ValueError(
3781                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3782                        )
3783                    else:
3784
3785                        log.debug(
3786                            f"Annotation '{annotation}' - file: "
3787                            + str(db_file)
3788                            + " and "
3789                            + str(db_hdr_file)
3790                        )
3791
3792                        # Load header as VCF object
3793                        db_hdr_vcf = Variants(input=db_hdr_file)
3794                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3795                        log.debug(
3796                            "Annotation database header: "
3797                            + str(db_hdr_vcf_header_infos)
3798                        )
3799
3800                        # For all fields in database
3801                        annotation_fields_full = False
3802                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3803                            annotation_fields = {
3804                                key: key for key in db_hdr_vcf_header_infos
3805                            }
3806                            log.debug(
3807                                "Annotation database header - All annotations added: "
3808                                + str(annotation_fields)
3809                            )
3810                            annotation_fields_full = True
3811
3812                        # # Create file for field rename
3813                        # log.debug("Create file for field rename")
3814                        # tmp_rename = NamedTemporaryFile(
3815                        #     prefix=self.get_prefix(),
3816                        #     dir=self.get_tmp_dir(),
3817                        #     suffix=".rename",
3818                        #     delete=False,
3819                        # )
3820                        # tmp_rename_name = tmp_rename.name
3821                        # tmp_files.append(tmp_rename_name)
3822
3823                        # Number of fields
3824                        nb_annotation_field = 0
3825                        annotation_list = []
3826                        annotation_infos_rename_list = []
3827
3828                        for annotation_field in annotation_fields:
3829
3830                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3831                            annotation_fields_new_name = annotation_fields.get(
3832                                annotation_field, annotation_field
3833                            )
3834                            if not annotation_fields_new_name:
3835                                annotation_fields_new_name = annotation_field
3836
3837                            # Check if field is in DB and if field is not elready in input data
3838                            if (
3839                                annotation_field in db_hdr_vcf.get_header().infos
3840                                and annotation_fields_new_name
3841                                not in self.get_header().infos
3842                            ):
3843
3844                                log.info(
3845                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3846                                )
3847
3848                                # BCFTools annotate param to rename fields
3849                                if annotation_field != annotation_fields_new_name:
3850                                    annotation_infos_rename_list.append(
3851                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3852                                    )
3853
3854                                # Add INFO field to header
3855                                db_hdr_vcf_header_infos_number = (
3856                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3857                                )
3858                                db_hdr_vcf_header_infos_type = (
3859                                    db_hdr_vcf_header_infos[annotation_field].type
3860                                    or "String"
3861                                )
3862                                db_hdr_vcf_header_infos_description = (
3863                                    db_hdr_vcf_header_infos[annotation_field].desc
3864                                    or f"{annotation_field} description"
3865                                )
3866                                db_hdr_vcf_header_infos_source = (
3867                                    db_hdr_vcf_header_infos[annotation_field].source
3868                                    or "unknown"
3869                                )
3870                                db_hdr_vcf_header_infos_version = (
3871                                    db_hdr_vcf_header_infos[annotation_field].version
3872                                    or "unknown"
3873                                )
3874
3875                                vcf_reader.infos[annotation_fields_new_name] = (
3876                                    vcf.parser._Info(
3877                                        annotation_fields_new_name,
3878                                        db_hdr_vcf_header_infos_number,
3879                                        db_hdr_vcf_header_infos_type,
3880                                        db_hdr_vcf_header_infos_description,
3881                                        db_hdr_vcf_header_infos_source,
3882                                        db_hdr_vcf_header_infos_version,
3883                                        self.code_type_map[
3884                                            db_hdr_vcf_header_infos_type
3885                                        ],
3886                                    )
3887                                )
3888
3889                                annotation_list.append(annotation_field)
3890
3891                                nb_annotation_field += 1
3892
3893                            else:
3894
3895                                if (
3896                                    annotation_field
3897                                    not in db_hdr_vcf.get_header().infos
3898                                ):
3899                                    log.warning(
3900                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3901                                    )
3902                                if (
3903                                    annotation_fields_new_name
3904                                    in self.get_header().infos
3905                                ):
3906                                    log.warning(
3907                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3908                                    )
3909
3910                        log.info(
3911                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3912                        )
3913
3914                        annotation_infos = ",".join(annotation_list)
3915
3916                        if annotation_infos != "":
3917
3918                            # Annotated VCF (and error file)
3919                            tmp_annotation_vcf_name = os.path.join(
3920                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3921                            )
3922                            tmp_annotation_vcf_name_err = (
3923                                tmp_annotation_vcf_name + ".err"
3924                            )
3925
3926                            # Add fields to annotate
3927                            if not annotation_fields_full:
3928                                annotation_infos_option = f"-info {annotation_infos}"
3929                            else:
3930                                annotation_infos_option = ""
3931
3932                            # Info fields rename
3933                            if annotation_infos_rename_list:
3934                                annotation_infos_rename = " -c " + ",".join(
3935                                    annotation_infos_rename_list
3936                                )
3937                            else:
3938                                annotation_infos_rename = ""
3939
3940                            # Annotate command
3941                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3942
3943                            # Add command
3944                            commands[command_annotate] = tmp_annotation_vcf_name
3945
3946                if commands:
3947
3948                    # Export VCF file
3949                    self.export_variant_vcf(
3950                        vcf_file=tmp_vcf_name,
3951                        remove_info=True,
3952                        add_samples=False,
3953                        index=True,
3954                    )
3955                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3956
3957                    # Num command
3958                    nb_command = 0
3959
3960                    # Annotate
3961                    for command_annotate in commands:
3962                        nb_command += 1
3963                        log.info(
3964                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3965                        )
3966                        log.debug(f"command_annotate={command_annotate}")
3967                        run_parallel_commands([command_annotate], threads)
3968
3969                        # Debug
3970                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3971
3972                        # Update variants
3973                        log.info(
3974                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3975                        )
3976                        self.update_from_vcf(commands[command_annotate])

This function annotates with bcftools.

Parameters
  • threads: number of threads to use
Returns

None.

def annotation_bcftools(self, threads: int = None) -> None:
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table using external databases with `bcftools annotate`.

        Databases come from param["annotation"]["bcftools"]["annotations"], a mapping
        of database file -> {field: new_name}. An empty field mapping, or the keys
        "ALL"/"INFO", select every INFO field declared in the database header.
        Each database must be compressed and tabix-indexed (a ".tbi" file must
        exist next to it) and must have a readable header file.

        Workflow:
        1. Export the current variants to a temporary bgzipped VCF.
        2. For each database and each chromosome present in the variants table,
           build a `bcftools annotate` command restricted (via --regions-file) to
           merged +/- 1Mb windows around the variant positions.
        3. Run all annotate commands in parallel, then merge the per-command
           outputs back with `bcftools merge --force-samples`.
        4. Parse the commands' stderr files for "[W::"/"[E::" messages, then
           update the variants table from the merged VCF.

        Newly annotated INFO fields are also registered in the in-memory VCF
        header (self.get_header().infos).

        :param threads: Number of threads to use; defaults to self.get_threads()
        :return: None
        :raises ValueError: if the bcftools binary cannot be resolved, if a
            database is not compressed, not indexed, or missing its file/header,
            or if any annotation command wrote "[E::" error lines to stderr
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep tmp files/folders when verbosity is "debug".
        # NOTE(review): delete_tmp is computed but never read below — tmp files
        # are removed unconditionally by the merge command; confirm intent.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and the bcftools-specific ones
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - databases to annotate with (mapping database -> fields)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Nothing to do on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Temporary VCF that will be the input of every annotate command
        # (actually written later by export_variant_vcf, once commands exist)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []  # annotated VCF outputs, merged at the end
            commands = []  # bcftools annotate shell commands
            tmp_files = []  # temporary files removed by the merge command
            err_files = []  # stderr capture files, scanned for [W::/[E::

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Resolve database, header and index files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check that the database file and its header file exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" expands to every INFO field of the database
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field new name; falls back to the original field name
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Keep the field only if it exists in the database header
                        # and is not already present in the input VCF header
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, defaulting missing metadata
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools -c syntax: "NEW:=INFO/OLD" renames a field
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools: keep only "##" lines
                        # (drop the "#CHROM" line and any variant lines)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command (zcat for gzipped headers, cat otherwise)
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases: prepend the positional columns expected by -c
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED restricting annotation to regions around variants
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/- 1Mb windows around each POS,
                            # clamped at 0, then merged into non-overlapping intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT  \"#CHROM\",
                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                        \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp output VCF (and its stderr capture file)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # annotate + tabix the output; stderr appended to .err file
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export current variants to the VCF consumed by the commands
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # Split the thread budget evenly across annotate commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() may yield 0 when there are more commands than threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all annotated VCFs back with the original
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file (delete=True: removed when the object is collected)
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Remove intermediate tmp files once the merge succeeds
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Scan stderr files for htslib-style [W::]/[E::] messages
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # Errors and warnings at info level
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # All messages at debug level
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # Any error line aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update the variants table from the merged annotated VCF
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates with bcftools.

Parameters
  • threads: number of threads to use
Returns

None.

def annotation_exomiser(self, threads: int = None) -> None:
4459    def annotation_exomiser(self, threads: int = None) -> None:
4460        """
4461        This function annotate with Exomiser
4462
4463        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
4464        - "analysis" (dict/file):
4465            Full analysis dictionnary parameters (see Exomiser docs).
4466            Either a dict, or a file in JSON or YAML format.
4467            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
4468            Default : None
4469        - "preset" (string):
4470            Analysis preset (available in config folder).
4471            Used if no full "analysis" is provided.
4472            Default: "exome"
4473        - "phenopacket" (dict/file):
4474            Samples and phenotipic features parameters (see Exomiser docs).
4475            Either a dict, or a file in JSON or YAML format.
4476            Default: None
4477        - "subject" (dict):
4478            Sample parameters (see Exomiser docs).
4479            Example:
4480                "subject":
4481                    {
4482                        "id": "ISDBM322017",
4483                        "sex": "FEMALE"
4484                    }
4485            Default: None
4486        - "sample" (string):
4487            Sample name to construct "subject" section:
4488                "subject":
4489                    {
4490                        "id": "<sample>",
4491                        "sex": "UNKNOWN_SEX"
4492                    }
4493            Default: None
4494        - "phenotypicFeatures" (dict)
4495            Phenotypic features to construct "subject" section.
4496            Example:
4497                "phenotypicFeatures":
4498                    [
4499                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
4500                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
4501                    ]
4502        - "hpo" (list)
4503            List of HPO ids as phenotypic features.
4504            Example:
4505                "hpo": ['0001156', '0001363', '0011304', '0010055']
4506            Default: []
4507        - "outputOptions" (dict):
4508            Output options (see Exomiser docs).
4509            Default:
4510                "output_options" =
4511                    {
4512                        "outputContributingVariantsOnly": False,
4513                        "numGenes": 0,
4514                        "outputFormats": ["TSV_VARIANT", "VCF"]
4515                    }
4516        - "transcript_source" (string):
4517            Transcript source (either "refseq", "ucsc", "ensembl")
4518            Default: "refseq"
4519        - "exomiser_to_info" (boolean):
4520            Add exomiser TSV file columns as INFO fields in VCF.
4521            Default: False
4522        - "release" (string):
4523            Exomise database release.
4524            If not exists, database release will be downloaded (take a while).
4525            Default: None (provided by application.properties configuration file)
4526        - "exomiser_application_properties" (file):
4527            Exomiser configuration file (see Exomiser docs).
4528            Useful to automatically download databases (especially for specific genome databases).
4529
4530        Notes:
4531        - If no sample in parameters, first sample in VCF will be chosen
4532        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4533
4534        :param threads: The number of threads to use
4535        :return: None.
4536        """
4537
4538        # DEBUG
4539        log.debug("Start annotation with Exomiser databases")
4540
4541        # Threads
4542        if not threads:
4543            threads = self.get_threads()
4544        log.debug("Threads: " + str(threads))
4545
4546        # Config
4547        config = self.get_config()
4548        log.debug("Config: " + str(config))
4549
4550        # Config - Folders - Databases
4551        databases_folders = (
4552            config.get("folders", {})
4553            .get("databases", {})
4554            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4555        )
4556        databases_folders = full_path(databases_folders)
4557        if not os.path.exists(databases_folders):
4558            log.error(f"Databases annotations: {databases_folders} NOT found")
4559        log.debug("Databases annotations: " + str(databases_folders))
4560
4561        # Config - Exomiser
4562        exomiser_bin_command = get_bin_command(
4563            bin="exomiser-cli*.jar",
4564            tool="exomiser",
4565            bin_type="jar",
4566            config=config,
4567            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4568        )
4569        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4570        if not exomiser_bin_command:
4571            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4572            log.error(msg_err)
4573            raise ValueError(msg_err)
4574
4575        # Param
4576        param = self.get_param()
4577        log.debug("Param: " + str(param))
4578
4579        # Param - Exomiser
4580        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4581        log.debug(f"Param Exomiser: {param_exomiser}")
4582
4583        # Param - Assembly
4584        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4585        log.debug("Assembly: " + str(assembly))
4586
4587        # Data
4588        table_variants = self.get_table_variants()
4589
4590        # Check if not empty
4591        log.debug("Check if not empty")
4592        sql_query_chromosomes = (
4593            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4594        )
4595        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4596            log.info(f"VCF empty")
4597            return False
4598
4599        # VCF header
4600        vcf_reader = self.get_header()
4601        log.debug("Initial header: " + str(vcf_reader.infos))
4602
4603        # Samples
4604        samples = self.get_header_sample_list()
4605        if not samples:
4606            log.error("No Samples in VCF")
4607            return False
4608        log.debug(f"Samples: {samples}")
4609
4610        # Memory limit
4611        memory_limit = self.get_memory("8G")
4612        log.debug(f"memory_limit: {memory_limit}")
4613
4614        # Exomiser java options
4615        exomiser_java_options = (
4616            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4617        )
4618        log.debug(f"Exomiser java options: {exomiser_java_options}")
4619
4620        # Download Exomiser (if not exists)
4621        exomiser_release = param_exomiser.get("release", None)
4622        exomiser_application_properties = param_exomiser.get(
4623            "exomiser_application_properties", None
4624        )
4625        databases_download_exomiser(
4626            assemblies=[assembly],
4627            exomiser_folder=databases_folders,
4628            exomiser_release=exomiser_release,
4629            exomiser_phenotype_release=exomiser_release,
4630            exomiser_application_properties=exomiser_application_properties,
4631        )
4632
4633        # Force annotation
4634        force_update_annotation = True
4635
4636        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4637            log.debug("Start annotation Exomiser")
4638
4639            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4640
4641                # tmp_dir = "/tmp/exomiser"
4642
4643                ### ANALYSIS ###
4644                ################
4645
4646                # Create analysis.json through analysis dict
4647                # either analysis in param or by default
4648                # depending on preset exome/genome)
4649
4650                # Init analysis dict
4651                param_exomiser_analysis_dict = {}
4652
4653                # analysis from param
4654                param_exomiser_analysis = param_exomiser.get("analysis", {})
4655                param_exomiser_analysis = full_path(param_exomiser_analysis)
4656
4657                # If analysis in param -> load anlaysis json
4658                if param_exomiser_analysis:
4659
4660                    # If param analysis is a file and exists
4661                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4662                        param_exomiser_analysis
4663                    ):
4664                        # Load analysis file into analysis dict (either yaml or json)
4665                        with open(param_exomiser_analysis) as json_file:
4666                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4667
4668                    # If param analysis is a dict
4669                    elif isinstance(param_exomiser_analysis, dict):
4670                        # Load analysis dict into analysis dict (either yaml or json)
4671                        param_exomiser_analysis_dict = param_exomiser_analysis
4672
4673                    # Error analysis type
4674                    else:
4675                        log.error(f"Analysis type unknown. Check param file.")
4676                        raise ValueError(f"Analysis type unknown. Check param file.")
4677
4678                # Case no input analysis config file/dict
4679                # Use preset (exome/genome) to open default config file
4680                if not param_exomiser_analysis_dict:
4681
4682                    # default preset
4683                    default_preset = "exome"
4684
4685                    # Get param preset or default preset
4686                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4687
4688                    # Try to find if preset is a file
4689                    if os.path.exists(param_exomiser_preset):
4690                        # Preset file is provided in full path
4691                        param_exomiser_analysis_default_config_file = (
4692                            param_exomiser_preset
4693                        )
4694                    # elif os.path.exists(full_path(param_exomiser_preset)):
4695                    #     # Preset file is provided in full path
4696                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4697                    elif os.path.exists(
4698                        os.path.join(folder_config, param_exomiser_preset)
4699                    ):
4700                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4701                        param_exomiser_analysis_default_config_file = os.path.join(
4702                            folder_config, param_exomiser_preset
4703                        )
4704                    else:
4705                        # Construct preset file
4706                        param_exomiser_analysis_default_config_file = os.path.join(
4707                            folder_config,
4708                            f"preset-{param_exomiser_preset}-analysis.json",
4709                        )
4710
4711                    # If preset file exists
4712                    param_exomiser_analysis_default_config_file = full_path(
4713                        param_exomiser_analysis_default_config_file
4714                    )
4715                    if os.path.exists(param_exomiser_analysis_default_config_file):
4716                        # Load prest file into analysis dict (either yaml or json)
4717                        with open(
4718                            param_exomiser_analysis_default_config_file
4719                        ) as json_file:
4720                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4721                                json_file
4722                            )
4723
4724                    # Error preset file
4725                    else:
4726                        log.error(
4727                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4728                        )
4729                        raise ValueError(
4730                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4731                        )
4732
4733                # If no analysis dict created
4734                if not param_exomiser_analysis_dict:
4735                    log.error(f"No analysis config")
4736                    raise ValueError(f"No analysis config")
4737
4738                # Log
4739                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4740
4741                ### PHENOPACKET ###
4742                ###################
4743
4744                # If no PhenoPacket in analysis dict -> check in param
4745                if "phenopacket" not in param_exomiser_analysis_dict:
4746
4747                    # If PhenoPacket in param -> load anlaysis json
4748                    if param_exomiser.get("phenopacket", None):
4749
4750                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4751                        param_exomiser_phenopacket = full_path(
4752                            param_exomiser_phenopacket
4753                        )
4754
4755                        # If param phenopacket is a file and exists
4756                        if isinstance(
4757                            param_exomiser_phenopacket, str
4758                        ) and os.path.exists(param_exomiser_phenopacket):
4759                            # Load phenopacket file into analysis dict (either yaml or json)
4760                            with open(param_exomiser_phenopacket) as json_file:
4761                                param_exomiser_analysis_dict["phenopacket"] = (
4762                                    yaml.safe_load(json_file)
4763                                )
4764
4765                        # If param phenopacket is a dict
4766                        elif isinstance(param_exomiser_phenopacket, dict):
4767                            # Load phenopacket dict into analysis dict (either yaml or json)
4768                            param_exomiser_analysis_dict["phenopacket"] = (
4769                                param_exomiser_phenopacket
4770                            )
4771
4772                        # Error phenopacket type
4773                        else:
4774                            log.error(f"Phenopacket type unknown. Check param file.")
4775                            raise ValueError(
4776                                f"Phenopacket type unknown. Check param file."
4777                            )
4778
4779                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4780                if "phenopacket" not in param_exomiser_analysis_dict:
4781
4782                    # Init PhenoPacket
4783                    param_exomiser_analysis_dict["phenopacket"] = {
4784                        "id": "analysis",
4785                        "proband": {},
4786                    }
4787
4788                    ### Add subject ###
4789
4790                    # If subject exists
4791                    param_exomiser_subject = param_exomiser.get("subject", {})
4792
4793                    # If subject not exists -> found sample ID
4794                    if not param_exomiser_subject:
4795
4796                        # Found sample ID in param
4797                        sample = param_exomiser.get("sample", None)
4798
4799                        # Find sample ID (first sample)
4800                        if not sample:
4801                            sample_list = self.get_header_sample_list()
4802                            if len(sample_list) > 0:
4803                                sample = sample_list[0]
4804                            else:
4805                                log.error(f"No sample found")
4806                                raise ValueError(f"No sample found")
4807
4808                        # Create subject
4809                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4810
4811                    # Add to dict
4812                    param_exomiser_analysis_dict["phenopacket"][
4813                        "subject"
4814                    ] = param_exomiser_subject
4815
4816                    ### Add "phenotypicFeatures" ###
4817
4818                    # If phenotypicFeatures exists
4819                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4820                        "phenotypicFeatures", []
4821                    )
4822
4823                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4824                    if not param_exomiser_phenotypicfeatures:
4825
4826                        # Found HPO in param
4827                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4828
4829                        # Split HPO if list in string format separated by comma
4830                        if isinstance(param_exomiser_hpo, str):
4831                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4832
4833                        # Create HPO list
4834                        for hpo in param_exomiser_hpo:
4835                            hpo_clean = re.sub("[^0-9]", "", hpo)
4836                            param_exomiser_phenotypicfeatures.append(
4837                                {
4838                                    "type": {
4839                                        "id": f"HP:{hpo_clean}",
4840                                        "label": f"HP:{hpo_clean}",
4841                                    }
4842                                }
4843                            )
4844
4845                    # Add to dict
4846                    param_exomiser_analysis_dict["phenopacket"][
4847                        "phenotypicFeatures"
4848                    ] = param_exomiser_phenotypicfeatures
4849
4850                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4851                    if not param_exomiser_phenotypicfeatures:
4852                        for step in param_exomiser_analysis_dict.get(
4853                            "analysis", {}
4854                        ).get("steps", []):
4855                            if "hiPhivePrioritiser" in step:
4856                                param_exomiser_analysis_dict.get("analysis", {}).get(
4857                                    "steps", []
4858                                ).remove(step)
4859
4860                ### Add Input File ###
4861
4862                # Initial file name and htsFiles
4863                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4864                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4865                    {
4866                        "uri": tmp_vcf_name,
4867                        "htsFormat": "VCF",
4868                        "genomeAssembly": assembly,
4869                    }
4870                ]
4871
4872                ### Add metaData ###
4873
4874                # If metaData not in analysis dict
4875                if "metaData" not in param_exomiser_analysis_dict:
4876                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4877                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4878                        "createdBy": "howard",
4879                        "phenopacketSchemaVersion": 1,
4880                    }
4881
4882                ### OutputOptions ###
4883
4884                # Init output result folder
4885                output_results = os.path.join(tmp_dir, "results")
4886
4887                # If no outputOptions in analysis dict
4888                if "outputOptions" not in param_exomiser_analysis_dict:
4889
4890                    # default output formats
4891                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4892
4893                    # Get outputOptions in param
4894                    output_options = param_exomiser.get("outputOptions", None)
4895
4896                    # If no output_options in param -> check
4897                    if not output_options:
4898                        output_options = {
4899                            "outputContributingVariantsOnly": False,
4900                            "numGenes": 0,
4901                            "outputFormats": defaut_output_formats,
4902                        }
4903
4904                    # Replace outputDirectory in output options
4905                    output_options["outputDirectory"] = output_results
4906                    output_options["outputFileName"] = "howard"
4907
4908                    # Add outputOptions in analysis dict
4909                    param_exomiser_analysis_dict["outputOptions"] = output_options
4910
4911                else:
4912
4913                    # Replace output_results and output format (if exists in param)
4914                    param_exomiser_analysis_dict["outputOptions"][
4915                        "outputDirectory"
4916                    ] = output_results
4917                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4918                        list(
4919                            set(
4920                                param_exomiser_analysis_dict.get(
4921                                    "outputOptions", {}
4922                                ).get("outputFormats", [])
4923                                + ["TSV_VARIANT", "VCF"]
4924                            )
4925                        )
4926                    )
4927
4928                # log
4929                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4930
4931                ### ANALYSIS FILE ###
4932                #####################
4933
4934                ### Full JSON analysis config file ###
4935
4936                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4937                with open(exomiser_analysis, "w") as fp:
4938                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4939
4940                ### SPLIT analysis and sample config files
4941
4942                # Splitted analysis dict
4943                param_exomiser_analysis_dict_for_split = (
4944                    param_exomiser_analysis_dict.copy()
4945                )
4946
4947                # Phenopacket JSON file
4948                exomiser_analysis_phenopacket = os.path.join(
4949                    tmp_dir, "analysis_phenopacket.json"
4950                )
4951                with open(exomiser_analysis_phenopacket, "w") as fp:
4952                    json.dump(
4953                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4954                        fp,
4955                        indent=4,
4956                    )
4957
4958                # Analysis JSON file without Phenopacket parameters
4959                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4960                exomiser_analysis_analysis = os.path.join(
4961                    tmp_dir, "analysis_analysis.json"
4962                )
4963                with open(exomiser_analysis_analysis, "w") as fp:
4964                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4965
4966                ### INITAL VCF file ###
4967                #######################
4968
4969                ### Create list of samples to use and include inti initial VCF file ####
4970
4971                # Subject (main sample)
4972                # Get sample ID in analysis dict
4973                sample_subject = (
4974                    param_exomiser_analysis_dict.get("phenopacket", {})
4975                    .get("subject", {})
4976                    .get("id", None)
4977                )
4978                sample_proband = (
4979                    param_exomiser_analysis_dict.get("phenopacket", {})
4980                    .get("proband", {})
4981                    .get("subject", {})
4982                    .get("id", None)
4983                )
4984                sample = []
4985                if sample_subject:
4986                    sample.append(sample_subject)
4987                if sample_proband:
4988                    sample.append(sample_proband)
4989
4990                # Get sample ID within Pedigree
4991                pedigree_persons_list = (
4992                    param_exomiser_analysis_dict.get("phenopacket", {})
4993                    .get("pedigree", {})
4994                    .get("persons", {})
4995                )
4996
4997                # Create list with all sample ID in pedigree (if exists)
4998                pedigree_persons = []
4999                for person in pedigree_persons_list:
5000                    pedigree_persons.append(person.get("individualId"))
5001
5002                # Concat subject sample ID and samples ID in pedigreesamples
5003                samples = list(set(sample + pedigree_persons))
5004
5005                # Check if sample list is not empty
5006                if not samples:
5007                    log.error(f"No samples found")
5008                    raise ValueError(f"No samples found")
5009
5010                # Create VCF with sample (either sample in param or first one by default)
5011                # Export VCF file
5012                self.export_variant_vcf(
5013                    vcf_file=tmp_vcf_name,
5014                    remove_info=True,
5015                    add_samples=True,
5016                    list_samples=samples,
5017                    index=False,
5018                )
5019
5020                ### Execute Exomiser ###
5021                ########################
5022
5023                # Init command
5024                exomiser_command = ""
5025
5026                # Command exomiser options
5027                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
5028
5029                # Release
5030                exomiser_release = param_exomiser.get("release", None)
5031                if exomiser_release:
5032                    # phenotype data version
5033                    exomiser_options += (
5034                        f" --exomiser.phenotype.data-version={exomiser_release} "
5035                    )
5036                    # data version
5037                    exomiser_options += (
5038                        f" --exomiser.{assembly}.data-version={exomiser_release} "
5039                    )
5040                    # variant white list
5041                    variant_white_list_file = (
5042                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
5043                    )
5044                    if os.path.exists(
5045                        os.path.join(
5046                            databases_folders, assembly, variant_white_list_file
5047                        )
5048                    ):
5049                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
5050
5051                # transcript_source
5052                transcript_source = param_exomiser.get(
5053                    "transcript_source", None
5054                )  # ucsc, refseq, ensembl
5055                if transcript_source:
5056                    exomiser_options += (
5057                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
5058                    )
5059
5060                # If analysis contain proband param
5061                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
5062                    "proband", {}
5063                ):
5064                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
5065
5066                # If no proband (usually uniq sample)
5067                else:
5068                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
5069
5070                # Log
5071                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
5072
5073                # Run command
5074                result = subprocess.call(
5075                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
5076                )
5077                if result:
5078                    log.error("Exomiser command failed")
5079                    raise ValueError("Exomiser command failed")
5080
5081                ### RESULTS ###
5082                ###############
5083
5084                ### Annotate with TSV fields ###
5085
5086                # Init result tsv file
5087                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
5088
5089                # Init result tsv file
5090                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
5091
5092                # Parse TSV file and explode columns in INFO field
5093                if exomiser_to_info and os.path.exists(output_results_tsv):
5094
5095                    # Log
5096                    log.debug("Exomiser columns to VCF INFO field")
5097
5098                    # Retrieve columns and types
5099                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
5100                    output_results_tsv_df = self.get_query_to_df(query)
5101                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
5102
5103                    # Init concat fields for update
5104                    sql_query_update_concat_fields = []
5105
5106                    # Fields to avoid
5107                    fields_to_avoid = [
5108                        "CONTIG",
5109                        "START",
5110                        "END",
5111                        "REF",
5112                        "ALT",
5113                        "QUAL",
5114                        "FILTER",
5115                        "GENOTYPE",
5116                    ]
5117
5118                    # List all columns to add into header
5119                    for header_column in output_results_tsv_columns:
5120
5121                        # If header column is enable
5122                        if header_column not in fields_to_avoid:
5123
5124                            # Header info type
5125                            header_info_type = "String"
5126                            header_column_df = output_results_tsv_df[header_column]
5127                            header_column_df_dtype = header_column_df.dtype
5128                            if header_column_df_dtype == object:
5129                                if (
5130                                    pd.to_numeric(header_column_df, errors="coerce")
5131                                    .notnull()
5132                                    .all()
5133                                ):
5134                                    header_info_type = "Float"
5135                            else:
5136                                header_info_type = "Integer"
5137
5138                            # Header info
5139                            characters_to_validate = ["-"]
5140                            pattern = "[" + "".join(characters_to_validate) + "]"
5141                            header_info_name = re.sub(
5142                                pattern,
5143                                "_",
5144                                f"Exomiser_{header_column}".replace("#", ""),
5145                            )
5146                            header_info_number = "."
5147                            header_info_description = (
5148                                f"Exomiser {header_column} annotation"
5149                            )
5150                            header_info_source = "Exomiser"
5151                            header_info_version = "unknown"
5152                            header_info_code = CODE_TYPE_MAP[header_info_type]
5153                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
5154                                header_info_name,
5155                                header_info_number,
5156                                header_info_type,
5157                                header_info_description,
5158                                header_info_source,
5159                                header_info_version,
5160                                header_info_code,
5161                            )
5162
5163                            # Add field to add for update to concat fields
5164                            sql_query_update_concat_fields.append(
5165                                f"""
5166                                CASE
5167                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
5168                                    THEN concat(
5169                                        '{header_info_name}=',
5170                                        table_parquet."{header_column}",
5171                                        ';'
5172                                        )
5173
5174                                    ELSE ''
5175                                END
5176                            """
5177                            )
5178
5179                    # Update query
5180                    sql_query_update = f"""
5181                        UPDATE {table_variants} as table_variants
5182                            SET INFO = concat(
5183                                            CASE
5184                                                WHEN INFO NOT IN ('', '.')
5185                                                THEN INFO
5186                                                ELSE ''
5187                                            END,
5188                                            CASE
5189                                                WHEN table_variants.INFO NOT IN ('','.')
5190                                                THEN ';'
5191                                                ELSE ''
5192                                            END,
5193                                            (
5194                                            SELECT 
5195                                                concat(
5196                                                    {",".join(sql_query_update_concat_fields)}
5197                                                )
5198                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
5199                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
5200                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
5201                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5202                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5203                                            )
5204                                        )
5205                            ;
5206                        """
5207
5208                    # Update
5209                    self.conn.execute(sql_query_update)
5210
5211                ### Annotate with VCF INFO field ###
5212
5213                # Init result VCF file
5214                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
5215
5216                # If VCF exists
5217                if os.path.exists(output_results_vcf):
5218
5219                    # Log
5220                    log.debug("Exomiser result VCF update variants")
5221
5222                    # Find Exomiser INFO field annotation in header
5223                    with gzip.open(output_results_vcf, "rt") as f:
5224                        header_list = self.read_vcf_header(f)
5225                    exomiser_vcf_header = vcf.Reader(
5226                        io.StringIO("\n".join(header_list))
5227                    )
5228
5229                    # Add annotation INFO field to header
5230                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
5231
5232                    # Update variants with VCF
5233                    self.update_from_vcf(output_results_vcf)
5234
5235        return True

This function annotate with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default : None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
5237    def annotation_snpeff(self, threads: int = None) -> None:
5238        """
5239        This function annotate with snpEff
5240
5241        :param threads: The number of threads to use
5242        :return: the value of the variable "return_value".
5243        """
5244
5245        # DEBUG
5246        log.debug("Start annotation with snpeff databases")
5247
5248        # Threads
5249        if not threads:
5250            threads = self.get_threads()
5251        log.debug("Threads: " + str(threads))
5252
5253        # DEBUG
5254        delete_tmp = True
5255        if self.get_config().get("verbosity", "warning") in ["debug"]:
5256            delete_tmp = False
5257            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5258
5259        # Config
5260        config = self.get_config()
5261        log.debug("Config: " + str(config))
5262
5263        # Config - Folders - Databases
5264        databases_folders = (
5265            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
5266        )
5267        log.debug("Databases annotations: " + str(databases_folders))
5268
5269        # Config - snpEff bin command
5270        snpeff_bin_command = get_bin_command(
5271            bin="snpEff.jar",
5272            tool="snpeff",
5273            bin_type="jar",
5274            config=config,
5275            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
5276        )
5277        if not snpeff_bin_command:
5278            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
5279            log.error(msg_err)
5280            raise ValueError(msg_err)
5281
5282        # Config - snpEff databases
5283        snpeff_databases = (
5284            config.get("folders", {})
5285            .get("databases", {})
5286            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
5287        )
5288        snpeff_databases = full_path(snpeff_databases)
5289        if snpeff_databases is not None and snpeff_databases != "":
5290            log.debug(f"Create snpEff databases folder")
5291            if not os.path.exists(snpeff_databases):
5292                os.makedirs(snpeff_databases)
5293
5294        # Param
5295        param = self.get_param()
5296        log.debug("Param: " + str(param))
5297
5298        # Param
5299        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
5300        log.debug("Options: " + str(options))
5301
5302        # Param - Assembly
5303        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5304
5305        # Param - Options
5306        snpeff_options = (
5307            param.get("annotation", {}).get("snpeff", {}).get("options", "")
5308        )
5309        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
5310        snpeff_csvstats = (
5311            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
5312        )
5313        if snpeff_stats:
5314            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
5315            snpeff_stats = full_path(snpeff_stats)
5316            snpeff_options += f" -stats {snpeff_stats}"
5317        if snpeff_csvstats:
5318            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
5319            snpeff_csvstats = full_path(snpeff_csvstats)
5320            snpeff_options += f" -csvStats {snpeff_csvstats}"
5321
5322        # Data
5323        table_variants = self.get_table_variants()
5324
5325        # Check if not empty
5326        log.debug("Check if not empty")
5327        sql_query_chromosomes = (
5328            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5329        )
5330        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
5331        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5332            log.info(f"VCF empty")
5333            return
5334
5335        # Export in VCF
5336        log.debug("Create initial file to annotate")
5337        tmp_vcf = NamedTemporaryFile(
5338            prefix=self.get_prefix(),
5339            dir=self.get_tmp_dir(),
5340            suffix=".vcf.gz",
5341            delete=True,
5342        )
5343        tmp_vcf_name = tmp_vcf.name
5344
5345        # VCF header
5346        vcf_reader = self.get_header()
5347        log.debug("Initial header: " + str(vcf_reader.infos))
5348
5349        # Existing annotations
5350        for vcf_annotation in self.get_header().infos:
5351
5352            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5353            log.debug(
5354                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5355            )
5356
5357        # Memory limit
5358        # if config.get("memory", None):
5359        #     memory_limit = config.get("memory", "8G")
5360        # else:
5361        #     memory_limit = "8G"
5362        memory_limit = self.get_memory("8G")
5363        log.debug(f"memory_limit: {memory_limit}")
5364
5365        # snpEff java options
5366        snpeff_java_options = (
5367            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
5368        )
5369        log.debug(f"Exomiser java options: {snpeff_java_options}")
5370
5371        force_update_annotation = True
5372
5373        if "ANN" not in self.get_header().infos or force_update_annotation:
5374
5375            # Check snpEff database
5376            log.debug(f"Check snpEff databases {[assembly]}")
5377            databases_download_snpeff(
5378                folder=snpeff_databases, assemblies=[assembly], config=config
5379            )
5380
5381            # Export VCF file
5382            self.export_variant_vcf(
5383                vcf_file=tmp_vcf_name,
5384                remove_info=True,
5385                add_samples=False,
5386                index=True,
5387            )
5388
5389            # Tmp file
5390            err_files = []
5391            tmp_annotate_vcf = NamedTemporaryFile(
5392                prefix=self.get_prefix(),
5393                dir=self.get_tmp_dir(),
5394                suffix=".vcf",
5395                delete=False,
5396            )
5397            tmp_annotate_vcf_name = tmp_annotate_vcf.name
5398            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5399            err_files.append(tmp_annotate_vcf_name_err)
5400
5401            # Command
5402            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
5403            log.debug(f"Annotation - snpEff command: {snpeff_command}")
5404            run_parallel_commands([snpeff_command], 1)
5405
5406            # Error messages
5407            log.info(f"Error/Warning messages:")
5408            error_message_command_all = []
5409            error_message_command_warning = []
5410            error_message_command_err = []
5411            for err_file in err_files:
5412                with open(err_file, "r") as f:
5413                    for line in f:
5414                        message = line.strip()
5415                        error_message_command_all.append(message)
5416                        if line.startswith("[W::"):
5417                            error_message_command_warning.append(message)
5418                        if line.startswith("[E::"):
5419                            error_message_command_err.append(f"{err_file}: " + message)
5420            # log info
5421            for message in list(
5422                set(error_message_command_err + error_message_command_warning)
5423            ):
5424                log.info(f"   {message}")
5425            # debug info
5426            for message in list(set(error_message_command_all)):
5427                log.debug(f"   {message}")
5428            # failed
5429            if len(error_message_command_err):
5430                log.error("Annotation failed: Error in commands")
5431                raise ValueError("Annotation failed: Error in commands")
5432
5433            # Find annotation in header
5434            with open(tmp_annotate_vcf_name, "rt") as f:
5435                header_list = self.read_vcf_header(f)
5436            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5437
5438            for ann in annovar_vcf_header.infos:
5439                if ann not in self.get_header().infos:
5440                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5441
5442            # Update variants
5443            log.info(f"Annotation - Updating...")
5444            self.update_from_vcf(tmp_annotate_vcf_name)
5445
5446        else:
5447            if "ANN" in self.get_header().infos:
5448                log.debug(f"Existing snpEff annotations in VCF")
5449            if force_update_annotation:
5450                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotate with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def annotation_annovar(self, threads: int = None) -> None:
5452    def annotation_annovar(self, threads: int = None) -> None:
5453        """
5454        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
5455        annotations
5456
5457        :param threads: number of threads to use
5458        :return: the value of the variable "return_value".
5459        """
5460
5461        # DEBUG
5462        log.debug("Start annotation with Annovar databases")
5463
5464        # Threads
5465        if not threads:
5466            threads = self.get_threads()
5467        log.debug("Threads: " + str(threads))
5468
5469        # Tmp en Err files
5470        tmp_files = []
5471        err_files = []
5472
5473        # DEBUG
5474        delete_tmp = True
5475        if self.get_config().get("verbosity", "warning") in ["debug"]:
5476            delete_tmp = False
5477            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5478
5479        # Config
5480        config = self.get_config()
5481        log.debug("Config: " + str(config))
5482
5483        # Config - Folders - Databases
5484        databases_folders = (
5485            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
5486        )
5487        log.debug("Databases annotations: " + str(databases_folders))
5488
5489        # Config - annovar bin command
5490        annovar_bin_command = get_bin_command(
5491            bin="table_annovar.pl",
5492            tool="annovar",
5493            bin_type="perl",
5494            config=config,
5495            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
5496        )
5497        if not annovar_bin_command:
5498            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
5499            log.error(msg_err)
5500            raise ValueError(msg_err)
5501
5502        # Config - BCFTools bin command
5503        bcftools_bin_command = get_bin_command(
5504            bin="bcftools",
5505            tool="bcftools",
5506            bin_type="bin",
5507            config=config,
5508            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
5509        )
5510        if not bcftools_bin_command:
5511            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
5512            log.error(msg_err)
5513            raise ValueError(msg_err)
5514
5515        # Config - annovar databases
5516        annovar_databases = (
5517            config.get("folders", {})
5518            .get("databases", {})
5519            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
5520        )
5521        if annovar_databases is not None:
5522            if isinstance(annovar_databases, list):
5523                annovar_databases = full_path(annovar_databases[0])
5524                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
5525            annovar_databases = full_path(annovar_databases)
5526            if not os.path.exists(annovar_databases):
5527                log.info(f"Annovar databases folder '{annovar_databases}' created")
5528                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
5529        else:
5530            msg_err = f"Annovar databases configuration failed"
5531            log.error(msg_err)
5532            raise ValueError(msg_err)
5533
5534        # Param
5535        param = self.get_param()
5536        log.debug("Param: " + str(param))
5537
5538        # Param - options
5539        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
5540        log.debug("Options: " + str(options))
5541
5542        # Param - annotations
5543        annotations = (
5544            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
5545        )
5546        log.debug("Annotations: " + str(annotations))
5547
5548        # Param - Assembly
5549        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5550
5551        # Annovar database assembly
5552        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
5553        if annovar_databases_assembly != "" and not os.path.exists(
5554            annovar_databases_assembly
5555        ):
5556            os.makedirs(annovar_databases_assembly)
5557
5558        # Data
5559        table_variants = self.get_table_variants()
5560
5561        # Check if not empty
5562        log.debug("Check if not empty")
5563        sql_query_chromosomes = (
5564            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5565        )
5566        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
5567        if not sql_query_chromosomes_df["count"][0]:
5568            log.info(f"VCF empty")
5569            return
5570
5571        # VCF header
5572        vcf_reader = self.get_header()
5573        log.debug("Initial header: " + str(vcf_reader.infos))
5574
5575        # Existing annotations
5576        for vcf_annotation in self.get_header().infos:
5577
5578            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5579            log.debug(
5580                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5581            )
5582
5583        force_update_annotation = True
5584
5585        if annotations:
5586
5587            commands = []
5588            tmp_annotates_vcf_name_list = []
5589
5590            # Export in VCF
5591            log.debug("Create initial file to annotate")
5592            tmp_vcf = NamedTemporaryFile(
5593                prefix=self.get_prefix(),
5594                dir=self.get_tmp_dir(),
5595                suffix=".vcf.gz",
5596                delete=False,
5597            )
5598            tmp_vcf_name = tmp_vcf.name
5599            tmp_files.append(tmp_vcf_name)
5600            tmp_files.append(tmp_vcf_name + ".tbi")
5601
5602            # Export VCF file
5603            self.export_variant_vcf(
5604                vcf_file=tmp_vcf_name,
5605                remove_info=".",
5606                add_samples=False,
5607                index=True,
5608            )
5609
5610            # Create file for field rename
5611            log.debug("Create file for field rename")
5612            tmp_rename = NamedTemporaryFile(
5613                prefix=self.get_prefix(),
5614                dir=self.get_tmp_dir(),
5615                suffix=".rename",
5616                delete=False,
5617            )
5618            tmp_rename_name = tmp_rename.name
5619            tmp_files.append(tmp_rename_name)
5620
5621            # Check Annovar database
5622            log.debug(
5623                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5624            )
5625            databases_download_annovar(
5626                folder=annovar_databases,
5627                files=list(annotations.keys()),
5628                assemblies=[assembly],
5629            )
5630
5631            for annotation in annotations:
5632                annotation_fields = annotations[annotation]
5633
5634                if not annotation_fields:
5635                    annotation_fields = {"INFO": None}
5636
5637                log.info(f"Annotations Annovar - database '{annotation}'")
5638                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5639
5640                # Tmp file for annovar
5641                err_files = []
5642                tmp_annotate_vcf_directory = TemporaryDirectory(
5643                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5644                )
5645                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5646                tmp_annotate_vcf_name_annovar = (
5647                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5648                )
5649                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5650                err_files.append(tmp_annotate_vcf_name_err)
5651                tmp_files.append(tmp_annotate_vcf_name_err)
5652
5653                # Tmp file final vcf annotated by annovar
5654                tmp_annotate_vcf = NamedTemporaryFile(
5655                    prefix=self.get_prefix(),
5656                    dir=self.get_tmp_dir(),
5657                    suffix=".vcf.gz",
5658                    delete=False,
5659                )
5660                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5661                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5662                tmp_files.append(tmp_annotate_vcf_name)
5663                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5664
5665                # Number of fields
5666                annotation_list = []
5667                annotation_renamed_list = []
5668
5669                for annotation_field in annotation_fields:
5670
5671                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5672                    annotation_fields_new_name = annotation_fields.get(
5673                        annotation_field, annotation_field
5674                    )
5675                    if not annotation_fields_new_name:
5676                        annotation_fields_new_name = annotation_field
5677
5678                    if (
5679                        force_update_annotation
5680                        or annotation_fields_new_name not in self.get_header().infos
5681                    ):
5682                        annotation_list.append(annotation_field)
5683                        annotation_renamed_list.append(annotation_fields_new_name)
5684                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5685                        log.warning(
5686                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5687                        )
5688
5689                    # Add rename info
5690                    run_parallel_commands(
5691                        [
5692                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5693                        ],
5694                        1,
5695                    )
5696
5697                # log.debug("fields_to_removed: " + str(fields_to_removed))
5698                log.debug("annotation_list: " + str(annotation_list))
5699
5700                # protocol
5701                protocol = annotation
5702
5703                # argument
5704                argument = ""
5705
5706                # operation
5707                operation = "f"
5708                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5709                    "ensGene"
5710                ):
5711                    operation = "g"
5712                    if options.get("genebase", None):
5713                        argument = f"""'{options.get("genebase","")}'"""
5714                elif annotation in ["cytoBand"]:
5715                    operation = "r"
5716
5717                # argument option
5718                argument_option = ""
5719                if argument != "":
5720                    argument_option = " --argument " + argument
5721
5722                # command options
5723                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5724                for option in options:
5725                    if option not in ["genebase"]:
5726                        command_options += f""" --{option}={options[option]}"""
5727
5728                # Command
5729
5730                # Command - Annovar
5731                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5732                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5733
5734                # Command - start pipe
5735                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5736
5737                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5738                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5739
5740                # Command - Special characters (refGene annotation)
5741                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5742
5743                # Command - Clean empty fields (with value ".")
5744                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5745
5746                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5747                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5748                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5749                    # for ann in annotation_renamed_list:
5750                    for ann in annotation_list:
5751                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5752
5753                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5754
5755                # Command - indexing
5756                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5757
5758                log.debug(f"Annotation - Annovar command: {command_annovar}")
5759                run_parallel_commands([command_annovar], 1)
5760
5761                # Error messages
5762                log.info(f"Error/Warning messages:")
5763                error_message_command_all = []
5764                error_message_command_warning = []
5765                error_message_command_err = []
5766                for err_file in err_files:
5767                    with open(err_file, "r") as f:
5768                        for line in f:
5769                            message = line.strip()
5770                            error_message_command_all.append(message)
5771                            if line.startswith("[W::") or line.startswith("WARNING"):
5772                                error_message_command_warning.append(message)
5773                            if line.startswith("[E::") or line.startswith("ERROR"):
5774                                error_message_command_err.append(
5775                                    f"{err_file}: " + message
5776                                )
5777                # log info
5778                for message in list(
5779                    set(error_message_command_err + error_message_command_warning)
5780                ):
5781                    log.info(f"   {message}")
5782                # debug info
5783                for message in list(set(error_message_command_all)):
5784                    log.debug(f"   {message}")
5785                # failed
5786                if len(error_message_command_err):
5787                    log.error("Annotation failed: Error in commands")
5788                    raise ValueError("Annotation failed: Error in commands")
5789
5790            if tmp_annotates_vcf_name_list:
5791
5792                # List of annotated files
5793                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5794
5795                # Tmp file
5796                tmp_annotate_vcf = NamedTemporaryFile(
5797                    prefix=self.get_prefix(),
5798                    dir=self.get_tmp_dir(),
5799                    suffix=".vcf.gz",
5800                    delete=False,
5801                )
5802                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5803                tmp_files.append(tmp_annotate_vcf_name)
5804                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5805                err_files.append(tmp_annotate_vcf_name_err)
5806                tmp_files.append(tmp_annotate_vcf_name_err)
5807
5808                # Command merge
5809                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5810                log.info(
5811                    f"Annotation Annovar - Annotation merging "
5812                    + str(len(tmp_annotates_vcf_name_list))
5813                    + " annotated files"
5814                )
5815                log.debug(f"Annotation - merge command: {merge_command}")
5816                run_parallel_commands([merge_command], 1)
5817
5818                # Find annotation in header
5819                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5820                    header_list = self.read_vcf_header(f)
5821                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5822
5823                for ann in annovar_vcf_header.infos:
5824                    if ann not in self.get_header().infos:
5825                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5826
5827                # Update variants
5828                log.info(f"Annotation Annovar - Updating...")
5829                self.update_from_vcf(tmp_annotate_vcf_name)
5830
5831            # Clean files
5832            # Tmp file remove command
5833            if True:
5834                tmp_files_remove_command = ""
5835                if tmp_files:
5836                    tmp_files_remove_command = " ".join(tmp_files)
5837                clean_command = f" rm -f {tmp_files_remove_command} "
5838                log.debug(f"Annotation Annovar - Annotation cleaning ")
5839                log.debug(f"Annotation - cleaning command: {clean_command}")
5840                run_parallel_commands([clean_command], 1)

annotation_annovar: takes the loaded VCF data, annotates it with Annovar, and then updates the database with the new annotations.

Parameters
  • threads: number of threads to use

Returns: None (the variants table is updated in place).
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with one or more parquet-format annotation
        databases.

        For each configured database, the method builds per-field SQL CASE
        fragments and executes per-chromosome DuckDB UPDATE queries that append
        '<field>=<value>' entries to the variants' INFO column. The in-memory
        VCF header (`self.get_header().infos`) is extended with each added
        field. Fields already present in the header are skipped unless the
        'annotations_update' option (remove then re-annotate) or the
        'annotations_append' option (fill only empty/'.' values) is set.
        The special annotation keys "ALL"/"INFO" select every field declared
        in a database's header.

        :param threads: number of threads to use for the annotation
        :return: None (the variants table and the VCF header are updated in place)
        :raises ValueError: if an annotation database file or its '.hdr' header
            file cannot be found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged only; no temporary
        # files are created in this method, so it is otherwise unused here.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Search folders: union of the generic 'annotations' folders and the
        # parquet-specific folders (deduplicated by set()).
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        # Mapping of database identifier -> {field: new_name_or_None}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        # 'annotations_update': existing INFO fields are removed first, then re-annotated.
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # 'annotations_append': only variants whose field is empty/'.' get annotated.
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        # NOTE(review): queries the hard-coded 'variants' table rather than
        # {table_variants} — presumably they are the same table; TODO confirm.
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): nothing appends to this list in this method, so the
        # final drop-column loop is currently a no-op — TODO confirm intended.
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            if "ALL" in annotations:

                # "ALL" pseudo-database: scan for every available database
                # (optionally filtered by formats/releases) and add each one
                # with all of its INFO fields.
                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" is a directive handled above, not a real database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion
                    # Attach the database to the DuckDB connection if needed,
                    # then get the SQL expression used to reference it in queries.
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    # Extra columns: database columns beyond the standard VCF ones.
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # Extra columns not declared in the header are registered
                    # as generic String INFO fields so "ALL" picks them up too.
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    # (only used for 'regions' databases, where one variant may
                    # overlap several regions and values are string_agg'ed)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    # Maps a field name to the database column holding it
                    # (falls back to the packed INFO column below).
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # Annotate when the field exists in the database header
                        # AND it is new to the variants header (or update/append
                        # mode forces re-processing).
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                # Update mode: strip the existing 'field=value'
                                # from INFO before the annotation queries run.
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                                )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            # First field gets no leading ';' separator.
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append
                            # Append mode: extra WHERE-like guard so only
                            # variants with an empty/'.' value get the field.
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            # Value is extracted from the database's packed INFO string.
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                """
                                )
                            # Found in a specific column
                            # Value read from a dedicated column; ';' is replaced
                            # by ',' so the value stays a single INFO entry.
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                    END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # Shortcut: when every field of the database is selected and
                    # the database exposes a full INFO column, copy that column
                    # verbatim instead of re-extracting field by field.
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Alias (not a copy): removal queries collected above run
                        # before the per-chromosome annotation queries added below.
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database
                            # Interval-overlap join on POS/REF span vs region
                            # START/END, aggregated per POS (string_agg above).
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                        )
                                        as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database
                            # Exact match on #CHROM/POS/REF/ALT.
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query
                            # New values are appended to INFO; the middle CASE
                            # inserts a ';' only when both the existing INFO and
                            # the new annotation string are non-empty.
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO = 
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                        AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.') 
                                                    THEN ';'
                                                    ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                            )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # The generated concat() of many CASE fragments can
                        # exceed DuckDB's default expression-depth limit.
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # DuckDB reports the number of updated rows in a 'Count' column
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

                    log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

annotation_parquet: takes the loaded VCF data and annotates it with one or more parquet-format annotation databases.

Parameters
  • threads: number of threads to use for the annotation

Returns: None (the variants table is updated in place).
6427    def annotation_splice(self, threads: int = None) -> None:
6428        """
6429        This function annotate with snpEff
6430
6431        :param threads: The number of threads to use
6432        :return: the value of the variable "return_value".
6433        """
6434
6435        # DEBUG
6436        log.debug("Start annotation with splice tools")
6437
6438        # Threads
6439        if not threads:
6440            threads = self.get_threads()
6441        log.debug("Threads: " + str(threads))
6442
6443        # DEBUG
6444        delete_tmp = True
6445        if self.get_config().get("verbosity", "warning") in ["debug"]:
6446            delete_tmp = False
6447            log.debug("Delete tmp files/folders: " + str(delete_tmp))
6448
6449        # Config
6450        config = self.get_config()
6451        log.debug("Config: " + str(config))
6452        splice_config = config.get("tools", {}).get("splice", {})
6453        if not splice_config:
6454            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
6455            msg_err = "No Splice tool config"
6456            raise ValueError(msg_err)
6457        log.debug(f"splice_config: {splice_config}")
6458
6459        # Config - Folders - Databases
6460        databases_folders = (
6461            config.get("folders", {}).get("databases", {}).get("splice", ["."])
6462        )
6463        log.debug("Databases annotations: " + str(databases_folders))
6464
6465        # Splice docker image
6466        splice_docker_image = splice_config.get("docker").get("image")
6467
6468        # Pull splice image if it's not already there
6469        if not check_docker_image_exists(splice_docker_image):
6470            log.warning(
6471                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
6472            )
6473            try:
6474                command(f"docker pull {splice_config.get('docker').get('image')}")
6475            except subprocess.CalledProcessError:
6476                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
6477                log.error(msg_err)
6478                raise ValueError(msg_err)
6479
6480        # Config - splice databases
6481        splice_databases = (
6482            config.get("folders", {})
6483            .get("databases", {})
6484            .get("splice", DEFAULT_SPLICE_FOLDER)
6485        )
6486        splice_databases = full_path(splice_databases)
6487
6488        # Param
6489        param = self.get_param()
6490        log.debug("Param: " + str(param))
6491
6492        # Param
6493        options = param.get("annotation", {}).get("splice", {}).get("options", {})
6494        log.debug("Options: " + str(options))
6495
6496        # Data
6497        table_variants = self.get_table_variants()
6498
6499        # Check if not empty
6500        log.debug("Check if not empty")
6501        sql_query_chromosomes = (
6502            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
6503        )
6504        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6505            log.info("VCF empty")
6506            return None
6507
6508        # Export in VCF
6509        log.debug("Create initial file to annotate")
6510
6511        # Create output folder / work folder
6512        if options.get("output_folder", ""):
6513            output_folder = options.get("output_folder", "")
6514            if not os.path.exists(output_folder):
6515                Path(output_folder).mkdir(parents=True, exist_ok=True)
6516        else:
6517            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6518            if not os.path.exists(output_folder):
6519                Path(output_folder).mkdir(parents=True, exist_ok=True)
6520
6521        if options.get("workdir", ""):
6522            workdir = options.get("workdir", "")
6523        else:
6524            workdir = "/work"
6525
6526        # Create tmp VCF file
6527        tmp_vcf = NamedTemporaryFile(
6528            prefix=self.get_prefix(),
6529            dir=output_folder,
6530            suffix=".vcf",
6531            delete=False,
6532        )
6533        tmp_vcf_name = tmp_vcf.name
6534
6535        # VCF header
6536        header = self.get_header()
6537
6538        # Existing annotations
6539        for vcf_annotation in self.get_header().infos:
6540
6541            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6542            log.debug(
6543                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6544            )
6545
6546        # Memory limit
6547        if config.get("memory", None):
6548            memory_limit = config.get("memory", "8G").upper()
6549            # upper()
6550        else:
6551            memory_limit = "8G"
6552        log.debug(f"memory_limit: {memory_limit}")
6553
6554        # Check number of variants to annotate
6555        where_clause_regex_spliceai = r"SpliceAI_\w+"
6556        where_clause_regex_spip = r"SPiP_\w+"
6557        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6558        df_list_of_variants_to_annotate = self.get_query_to_df(
6559            query=f""" SELECT * FROM variants {where_clause} """
6560        )
6561        if len(df_list_of_variants_to_annotate) == 0:
6562            log.warning(
6563                f"No variants to annotate with splice. Variants probably already annotated with splice"
6564            )
6565            return None
6566        else:
6567            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6568
6569        # Export VCF file
6570        self.export_variant_vcf(
6571            vcf_file=tmp_vcf_name,
6572            remove_info=True,
6573            add_samples=True,
6574            index=False,
6575            where_clause=where_clause,
6576        )
6577        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
6578        if any(value for value in splice_config.values() if value is None):
6579            log.warning("At least one splice config parameter is empty")
6580            # exit annotation_splice
6581            return None
6582
6583        # Params in splice nf
6584        def check_values(dico: dict):
6585            """
6586            Ensure parameters for NF splice pipeline
6587            """
6588            for key, val in dico.items():
6589                if key == "genome":
6590                    if any(
6591                        assemb in options.get("genome", {})
6592                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6593                    ):
6594                        yield f"--{key} hg19"
6595                    elif any(
6596                        assemb in options.get("genome", {})
6597                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6598                    ):
6599                        yield f"--{key} hg38"
6600                elif (
6601                    (isinstance(val, str) and val)
6602                    or isinstance(val, int)
6603                    or isinstance(val, bool)
6604                ):
6605                    yield f"--{key} {val}"
6606
6607        # Genome
6608        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6609        options["genome"] = genome
6610        # NF params
6611        nf_params = []
6612        # Add options
6613        if options:
6614            log.debug(options)
6615            nf_params = list(check_values(options))
6616            log.debug(f"Splice NF params: {' '.join(nf_params)}")
6617        else:
6618            log.debug("No NF params provided")
6619        # Add threads
6620        if "threads" not in options.keys():
6621            nf_params.append(f"--threads {threads}")
6622        # Genome path
6623        genome_path = find_genome(
6624            config.get("folders", {})
6625            .get("databases", {})
6626            .get("genomes", DEFAULT_GENOME_FOLDER),
6627            file=f"{genome}.fa",
6628        )
6629        # Add genome path
6630        if not genome_path:
6631            raise ValueError(
6632                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6633            )
6634        else:
6635            log.debug(f"Genome: {genome_path}")
6636            nf_params.append(f"--genome_path {genome_path}")
6637
6638        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6639            """
6640            Setting up updated databases for SPiP and SpliceAI
6641            """
6642
6643            try:
6644
6645                # SpliceAI assembly transcriptome
6646                spliceai_assembly = os.path.join(
6647                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
6648                    options.get("genome"),
6649                    "transcriptome",
6650                )
6651                spip_assembly = options.get("genome")
6652
6653                spip = find(
6654                    f"transcriptome_{spip_assembly}.RData",
6655                    config.get("folders", {}).get("databases", {}).get("spip", {}),
6656                )
6657                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6658                log.debug(f"SPiP annotations: {spip}")
6659                log.debug(f"SpliceAI annotations: {spliceai}")
6660                if spip and spliceai:
6661                    return [
6662                        f"--spip_transcriptome {spip}",
6663                        f"--spliceai_transcriptome {spliceai}",
6664                    ]
6665                else:
6666                    log.warning(
6667                        "Can't find splice databases in configuration, use annotations file from image"
6668                    )
6669            except TypeError:
6670                log.warning(
6671                    "Can't find splice databases in configuration, use annotations file from image"
6672                )
6673                return []
6674
6675        # Add options, check if transcriptome option have already beend provided
6676        if (
6677            "spip_transcriptome" not in nf_params
6678            and "spliceai_transcriptome" not in nf_params
6679        ):
6680            splice_reference = splice_annotations(options, config)
6681            if splice_reference:
6682                nf_params.extend(splice_reference)
6683        # nf_params.append(f"--output_folder {output_folder}")
6684        random_uuid = f"HOWARD-SPLICE-{get_random()}"
6685        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6686        log.debug(cmd)
6687        splice_config["docker"]["command"] = cmd
6688
6689        # Ensure proxy is set
6690        proxy = [
6691            f"-e {var}={os.getenv(var)}"
6692            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
6693            if os.getenv(var) is not None
6694        ]
6695        docker_cmd = get_bin_command(
6696            tool="splice",
6697            bin_type="docker",
6698            config=config,
6699            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6700            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
6701        )
6702        # print(docker_cmd)
6703        # exit()
6704        # Docker debug
6705        # if splice_config.get("rm_container"):
6706        #     rm_container = "--rm"
6707        # else:
6708        #     rm_container = ""
6709        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6710        log.debug(docker_cmd)
6711        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6712        log.debug(res.stdout)
6713        if res.stderr:
6714            log.error(res.stderr)
6715        res.check_returncode()
6716        # Update variants
6717        log.info("Annotation - Updating...")
6718        # Test find output vcf
6719        log.debug(
6720            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6721        )
6722        output_vcf = []
6723        # Wrong folder to look in
6724        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6725            if (
6726                files
6727                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6728            ):
6729                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6730        # log.debug(os.listdir(options.get("output_folder")))
6731        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6732        if not output_vcf:
6733            log.debug(
6734                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6735            )
6736        else:
6737            # Get new header from annotated vcf
6738            log.debug(f"Initial header: {len(header.infos)} fields")
6739            # Create new header with splice infos
6740            new_vcf = Variants(input=output_vcf[0])
6741            new_vcf_header = new_vcf.get_header().infos
6742            for keys, infos in new_vcf_header.items():
6743                if keys not in header.infos.keys():
6744                    header.infos[keys] = infos
6745            log.debug(f"New header: {len(header.infos)} fields")
6746            log.debug(f"Splice tmp output: {output_vcf[0]}")
6747            self.update_from_vcf(output_vcf[0])
6748
6749        # Remove file
6750        remove_if_exists(output_vcf)

This function annotates variants with splice prediction tools (SPiP and SpliceAI).

Parameters
  • threads: The number of threads to use
Returns

None.

def get_config_default(self, name: str) -> dict:
6756    def get_config_default(self, name: str) -> dict:
6757        """
6758        The function `get_config_default` returns a dictionary containing default configurations for
6759        various calculations and prioritizations.
6760
6761        :param name: The `get_config_default` function returns a dictionary containing default
6762        configurations for different calculations and prioritizations. The `name` parameter is used to
6763        specify which specific configuration to retrieve from the dictionary
6764        :type name: str
6765        :return: The function `get_config_default` returns a dictionary containing default configuration
6766        settings for different calculations and prioritizations. The specific configuration settings are
6767        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6768        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6769        returned. If there is no match, an empty dictionary is returned.
6770        """
6771
6772        config_default = {
6773            "calculations": {
6774                "variant_chr_pos_alt_ref": {
6775                    "type": "sql",
6776                    "name": "variant_chr_pos_alt_ref",
6777                    "description": "Create a variant ID with chromosome, position, alt and ref",
6778                    "available": False,
6779                    "output_column_name": "variant_chr_pos_alt_ref",
6780                    "output_column_type": "String",
6781                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6782                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6783                    "operation_info": True,
6784                },
6785                "VARTYPE": {
6786                    "type": "sql",
6787                    "name": "VARTYPE",
6788                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6789                    "available": True,
6790                    "table": "variants",
6791                    "output_column_name": "VARTYPE",
6792                    "output_column_type": "String",
6793                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6794                    "operation_query": """
6795                            CASE
6796                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6797                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6798                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6799                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6800                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6801                                ELSE 'UNDEFINED'
6802                            END
6803                            """,
6804                    "info_fields": ["SVTYPE"],
6805                    "operation_info": True,
6806                },
6807                "snpeff_hgvs": {
6808                    "type": "python",
6809                    "name": "snpeff_hgvs",
6810                    "description": "HGVS nomenclatures from snpEff annotation",
6811                    "available": True,
6812                    "function_name": "calculation_extract_snpeff_hgvs",
6813                    "function_params": ["snpeff_hgvs", "ANN"],
6814                },
6815                "snpeff_ann_explode": {
6816                    "type": "python",
6817                    "name": "snpeff_ann_explode",
6818                    "description": "Explode snpEff annotations with uniquify values",
6819                    "available": True,
6820                    "function_name": "calculation_snpeff_ann_explode",
6821                    "function_params": [False, "fields", "snpeff_", "ANN"],
6822                },
6823                "snpeff_ann_explode_uniquify": {
6824                    "type": "python",
6825                    "name": "snpeff_ann_explode_uniquify",
6826                    "description": "Explode snpEff annotations",
6827                    "available": True,
6828                    "function_name": "calculation_snpeff_ann_explode",
6829                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6830                },
6831                "snpeff_ann_explode_json": {
6832                    "type": "python",
6833                    "name": "snpeff_ann_explode_json",
6834                    "description": "Explode snpEff annotations in JSON format",
6835                    "available": True,
6836                    "function_name": "calculation_snpeff_ann_explode",
6837                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6838                },
6839                "NOMEN": {
6840                    "type": "python",
6841                    "name": "NOMEN",
6842                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
6843                    "available": True,
6844                    "function_name": "calculation_extract_nomen",
6845                    "function_params": [],
6846                },
6847                "RENAME_INFO_FIELDS": {
6848                    "type": "python",
6849                    "name": "RENAME_INFO_FIELDS",
6850                    "description": "Rename or remove INFO/tags",
6851                    "available": True,
6852                    "function_name": "calculation_rename_info_fields",
6853                    "function_params": [],
6854                },
6855                "FINDBYPIPELINE": {
6856                    "type": "python",
6857                    "name": "FINDBYPIPELINE",
6858                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6859                    "available": True,
6860                    "function_name": "calculation_find_by_pipeline",
6861                    "function_params": ["findbypipeline"],
6862                },
6863                "FINDBYSAMPLE": {
6864                    "type": "python",
6865                    "name": "FINDBYSAMPLE",
6866                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6867                    "available": True,
6868                    "function_name": "calculation_find_by_pipeline",
6869                    "function_params": ["findbysample"],
6870                },
6871                "GENOTYPECONCORDANCE": {
6872                    "type": "python",
6873                    "name": "GENOTYPECONCORDANCE",
6874                    "description": "Concordance of genotype for multi caller VCF",
6875                    "available": True,
6876                    "function_name": "calculation_genotype_concordance",
6877                    "function_params": [],
6878                },
6879                "BARCODE": {
6880                    "type": "python",
6881                    "name": "BARCODE",
6882                    "description": "BARCODE as VaRank tool",
6883                    "available": True,
6884                    "function_name": "calculation_barcode",
6885                    "function_params": [],
6886                },
6887                "BARCODEFAMILY": {
6888                    "type": "python",
6889                    "name": "BARCODEFAMILY",
6890                    "description": "BARCODEFAMILY as VaRank tool",
6891                    "available": True,
6892                    "function_name": "calculation_barcode_family",
6893                    "function_params": ["BCF"],
6894                },
6895                "TRIO": {
6896                    "type": "python",
6897                    "name": "TRIO",
6898                    "description": "Inheritance for a trio family",
6899                    "available": True,
6900                    "function_name": "calculation_trio",
6901                    "function_params": [],
6902                },
6903                "VAF": {
6904                    "type": "python",
6905                    "name": "VAF",
6906                    "description": "Variant Allele Frequency (VAF) harmonization",
6907                    "available": True,
6908                    "function_name": "calculation_vaf_normalization",
6909                    "function_params": [],
6910                },
6911                "VAF_stats": {
6912                    "type": "python",
6913                    "name": "VAF_stats",
6914                    "description": "Variant Allele Frequency (VAF) statistics",
6915                    "available": True,
6916                    "function_name": "calculation_genotype_stats",
6917                    "function_params": ["VAF"],
6918                },
6919                "DP_stats": {
6920                    "type": "python",
6921                    "name": "DP_stats",
6922                    "description": "Depth (DP) statistics",
6923                    "available": True,
6924                    "function_name": "calculation_genotype_stats",
6925                    "function_params": ["DP"],
6926                },
6927                "variant_id": {
6928                    "type": "python",
6929                    "name": "variant_id",
6930                    "description": "Variant ID generated from variant position and type",
6931                    "available": True,
6932                    "function_name": "calculation_variant_id",
6933                    "function_params": [],
6934                },
6935                "transcripts_json": {
6936                    "type": "python",
6937                    "name": "transcripts_json",
6938                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
6939                    "available": True,
6940                    "function_name": "calculation_transcripts_annotation",
6941                    "function_params": ["transcripts_json", None],
6942                },
6943                "transcripts_ann": {
6944                    "type": "python",
6945                    "name": "transcripts_ann",
6946                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
6947                    "available": True,
6948                    "function_name": "calculation_transcripts_annotation",
6949                    "function_params": [None, "transcripts_ann"],
6950                },
6951                "transcripts_annotations": {
6952                    "type": "python",
6953                    "name": "transcripts_annotations",
6954                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
6955                    "available": True,
6956                    "function_name": "calculation_transcripts_annotation",
6957                    "function_params": [None, None],
6958                },
6959                "transcripts_prioritization": {
6960                    "type": "python",
6961                    "name": "transcripts_prioritization",
6962                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
6963                    "available": True,
6964                    "function_name": "calculation_transcripts_prioritization",
6965                    "function_params": [],
6966                },
6967                "transcripts_export": {
6968                    "type": "python",
6969                    "name": "transcripts_export",
6970                    "description": "Export transcripts table/view as a file (using param.json)",
6971                    "available": True,
6972                    "function_name": "calculation_transcripts_export",
6973                    "function_params": [],
6974                },
6975            },
6976            "prioritizations": {
6977                "default": {
6978                    "ANN2": [
6979                        {
6980                            "type": "contains",
6981                            "value": "HIGH",
6982                            "score": 5,
6983                            "flag": "PASS",
6984                            "comment": [
6985                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6986                            ],
6987                        },
6988                        {
6989                            "type": "contains",
6990                            "value": "MODERATE",
6991                            "score": 3,
6992                            "flag": "PASS",
6993                            "comment": [
6994                                "A non-disruptive variant that might change protein effectiveness"
6995                            ],
6996                        },
6997                        {
6998                            "type": "contains",
6999                            "value": "LOW",
7000                            "score": 0,
7001                            "flag": "FILTERED",
7002                            "comment": [
7003                                "Assumed to be mostly harmless or unlikely to change protein behavior"
7004                            ],
7005                        },
7006                        {
7007                            "type": "contains",
7008                            "value": "MODIFIER",
7009                            "score": 0,
7010                            "flag": "FILTERED",
7011                            "comment": [
7012                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
7013                            ],
7014                        },
7015                    ],
7016                }
7017            },
7018        }
7019
7020        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The get_config_default function returns a dictionary containing default configurations for different calculations and prioritizations. The name parameter is used to specify which specific configuration to retrieve from the dictionary
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
7022    def get_config_json(
7023        self, name: str, config_dict: dict = {}, config_file: str = None
7024    ) -> dict:
7025        """
7026        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
7027        default values, a dictionary, and a file.
7028
7029        :param name: The `name` parameter in the `get_config_json` function is a string that represents
7030        the name of the configuration. It is used to identify and retrieve the configuration settings
7031        for a specific component or module
7032        :type name: str
7033        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
7034        dictionary that allows you to provide additional configuration settings or overrides. When you
7035        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
7036        the key is the configuration setting you want to override or
7037        :type config_dict: dict
7038        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
7039        specify the path to a configuration file that contains additional settings. If provided, the
7040        function will read the contents of this file and update the configuration dictionary with the
7041        values found in the file, overriding any existing values with the
7042        :type config_file: str
7043        :return: The function `get_config_json` returns a dictionary containing the configuration
7044        settings.
7045        """
7046
7047        # Create with default prioritizations
7048        config_default = self.get_config_default(name=name)
7049        configuration = config_default
7050        # log.debug(f"configuration={configuration}")
7051
7052        # Replace prioritizations from dict
7053        for config in config_dict:
7054            configuration[config] = config_dict[config]
7055
7056        # Replace prioritizations from file
7057        config_file = full_path(config_file)
7058        if config_file:
7059            if os.path.exists(config_file):
7060                with open(config_file) as config_file_content:
7061                    config_file_dict = yaml.safe_load(config_file_content)
7062                for config in config_file_dict:
7063                    configuration[config] = config_file_dict[config]
7064            else:
7065                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
7066                log.error(msg_error)
7067                raise ValueError(msg_error)
7068
7069        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: The config_dict parameter in the get_config_json function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the get_config_json function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or
  • config_file: The config_file parameter in the get_config_json function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization(self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
7071    def prioritization(
7072        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
7073    ) -> bool:
7074        """
7075        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
7076        prioritizes variants based on configured profiles and criteria.
7077
7078        :param table: The `table` parameter in the `prioritization` function is used to specify the name
7079        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
7080        a table name is provided, the method will prioritize the variants in that specific table
7081        :type table: str
7082        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
7083        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
7084        provided, the code will use a default prefix value of "PZ"
7085        :type pz_prefix: str
7086        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
7087        additional parameters specific to the prioritization process. These parameters can include
7088        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
7089        configurations needed for the prioritization of variants in a V
7090        :type pz_param: dict
7091        :return: A boolean value (True) is being returned from the `prioritization` function.
7092        """
7093
7094        # Config
7095        config = self.get_config()
7096
7097        # Param
7098        param = self.get_param()
7099
7100        # Prioritization param
7101        if pz_param is not None:
7102            prioritization_param = pz_param
7103        else:
7104            prioritization_param = param.get("prioritization", {})
7105
7106        # Configuration profiles
7107        prioritization_config_file = prioritization_param.get(
7108            "prioritization_config", None
7109        )
7110        prioritization_config_file = full_path(prioritization_config_file)
7111        prioritizations_config = self.get_config_json(
7112            name="prioritizations", config_file=prioritization_config_file
7113        )
7114
7115        # Prioritization prefix
7116        pz_prefix_default = "PZ"
7117        if pz_prefix is None:
7118            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
7119
7120        # Prioritization options
7121        profiles = prioritization_param.get("profiles", [])
7122        if isinstance(profiles, str):
7123            profiles = profiles.split(",")
7124        pzfields = prioritization_param.get(
7125            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
7126        )
7127        if isinstance(pzfields, str):
7128            pzfields = pzfields.split(",")
7129        default_profile = prioritization_param.get("default_profile", None)
7130        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
7131        prioritization_score_mode = prioritization_param.get(
7132            "prioritization_score_mode", "HOWARD"
7133        )
7134
7135        # Quick Prioritizations
7136        prioritizations = param.get("prioritizations", None)
7137        if prioritizations:
7138            log.info("Quick Prioritization:")
7139            for profile in prioritizations.split(","):
7140                if profile not in profiles:
7141                    profiles.append(profile)
7142                    log.info(f"   {profile}")
7143
7144        # If profile "ALL" provided, all profiles in the config profiles
7145        if "ALL" in profiles:
7146            profiles = list(prioritizations_config.keys())
7147
7148        for profile in profiles:
7149            if prioritizations_config.get(profile, None):
7150                log.debug(f"Profile '{profile}' configured")
7151            else:
7152                msg_error = f"Profile '{profile}' NOT configured"
7153                log.error(msg_error)
7154                raise ValueError(msg_error)
7155
7156        if profiles:
7157            log.info(f"Prioritization... ")
7158        else:
7159            log.debug(f"No profile defined")
7160            return False
7161
7162        if not default_profile and len(profiles):
7163            default_profile = profiles[0]
7164
7165        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
7166        log.debug("Profiles to check: " + str(list(profiles)))
7167
7168        # Variables
7169        if table is not None:
7170            table_variants = table
7171        else:
7172            table_variants = self.get_table_variants(clause="update")
7173        log.debug(f"Table to prioritize: {table_variants}")
7174
7175        # Added columns
7176        added_columns = []
7177
7178        # Create list of PZfields
7179        # List of PZFields
7180        list_of_pzfields_original = pzfields + [
7181            pzfield + pzfields_sep + profile
7182            for pzfield in pzfields
7183            for profile in profiles
7184        ]
7185        list_of_pzfields = []
7186        log.debug(f"{list_of_pzfields_original}")
7187
7188        # Remove existing PZfields to use if exists
7189        for pzfield in list_of_pzfields_original:
7190            if self.get_header().infos.get(pzfield, None) is None:
7191                list_of_pzfields.append(pzfield)
7192                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
7193            else:
7194                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
7195
7196        if list_of_pzfields:
7197
7198            # Explode Infos prefix
7199            explode_infos_prefix = self.get_explode_infos_prefix()
7200
7201            # PZfields tags description
7202            PZfields_INFOS = {
7203                f"{pz_prefix}Tags": {
7204                    "ID": f"{pz_prefix}Tags",
7205                    "Number": ".",
7206                    "Type": "String",
7207                    "Description": "Variant tags based on annotation criteria",
7208                },
7209                f"{pz_prefix}Score": {
7210                    "ID": f"{pz_prefix}Score",
7211                    "Number": 1,
7212                    "Type": "Integer",
7213                    "Description": "Variant score based on annotation criteria",
7214                },
7215                f"{pz_prefix}Flag": {
7216                    "ID": f"{pz_prefix}Flag",
7217                    "Number": 1,
7218                    "Type": "String",
7219                    "Description": "Variant flag based on annotation criteria",
7220                },
7221                f"{pz_prefix}Comment": {
7222                    "ID": f"{pz_prefix}Comment",
7223                    "Number": ".",
7224                    "Type": "String",
7225                    "Description": "Variant comment based on annotation criteria",
7226                },
7227                f"{pz_prefix}Infos": {
7228                    "ID": f"{pz_prefix}Infos",
7229                    "Number": ".",
7230                    "Type": "String",
7231                    "Description": "Variant infos based on annotation criteria",
7232                },
7233                f"{pz_prefix}Class": {
7234                    "ID": f"{pz_prefix}Class",
7235                    "Number": ".",
7236                    "Type": "String",
7237                    "Description": "Variant class based on annotation criteria",
7238                },
7239            }
7240
7241            # Create INFO fields if not exist
7242            for field in PZfields_INFOS:
7243                field_ID = PZfields_INFOS[field]["ID"]
7244                field_description = PZfields_INFOS[field]["Description"]
7245                if field_ID not in self.get_header().infos and field_ID in pzfields:
7246                    field_description = (
7247                        PZfields_INFOS[field]["Description"]
7248                        + f", profile {default_profile}"
7249                    )
7250                    self.get_header().infos[field_ID] = vcf.parser._Info(
7251                        field_ID,
7252                        PZfields_INFOS[field]["Number"],
7253                        PZfields_INFOS[field]["Type"],
7254                        field_description,
7255                        "unknown",
7256                        "unknown",
7257                        code_type_map[PZfields_INFOS[field]["Type"]],
7258                    )
7259
7260            # Create INFO fields if not exist for each profile
7261            for profile in prioritizations_config:
7262                if profile in profiles or profiles == []:
7263                    for field in PZfields_INFOS:
7264                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
7265                        field_description = (
7266                            PZfields_INFOS[field]["Description"]
7267                            + f", profile {profile}"
7268                        )
7269                        if (
7270                            field_ID not in self.get_header().infos
7271                            and field in pzfields
7272                        ):
7273                            self.get_header().infos[field_ID] = vcf.parser._Info(
7274                                field_ID,
7275                                PZfields_INFOS[field]["Number"],
7276                                PZfields_INFOS[field]["Type"],
7277                                field_description,
7278                                "unknown",
7279                                "unknown",
7280                                code_type_map[PZfields_INFOS[field]["Type"]],
7281                            )
7282
7283            # Header
7284            for pzfield in list_of_pzfields:
7285                if re.match(f"{pz_prefix}Score.*", pzfield):
7286                    added_column = self.add_column(
7287                        table_name=table_variants,
7288                        column_name=pzfield,
7289                        column_type="INTEGER",
7290                        default_value="0",
7291                    )
7292                elif re.match(f"{pz_prefix}Flag.*", pzfield):
7293                    added_column = self.add_column(
7294                        table_name=table_variants,
7295                        column_name=pzfield,
7296                        column_type="BOOLEAN",
7297                        default_value="1",
7298                    )
7299                elif re.match(f"{pz_prefix}Class.*", pzfield):
7300                    added_column = self.add_column(
7301                        table_name=table_variants,
7302                        column_name=pzfield,
7303                        column_type="VARCHAR[]",
7304                        default_value="null",
7305                    )
7306                else:
7307                    added_column = self.add_column(
7308                        table_name=table_variants,
7309                        column_name=pzfield,
7310                        column_type="STRING",
7311                        default_value="''",
7312                    )
7313                added_columns.append(added_column)
7314
7315            # Profiles
7316            if profiles:
7317
7318                # foreach profile in configuration file
7319                for profile in prioritizations_config:
7320
7321                    # If profile is asked in param, or ALL are asked (empty profile [])
7322                    if profile in profiles or profiles == []:
7323                        log.info(f"Profile '{profile}'")
7324
7325                        sql_set_info_option = ""
7326
7327                        sql_set_info = []
7328
7329                        # PZ fields set
7330
7331                        # PZScore
7332                        if (
7333                            f"{pz_prefix}Score{pzfields_sep}{profile}"
7334                            in list_of_pzfields
7335                        ):
7336                            sql_set_info.append(
7337                                f"""
7338                                    concat(
7339                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
7340                                        {pz_prefix}Score{pzfields_sep}{profile}
7341                                    ) 
7342                                """
7343                            )
7344                            if (
7345                                profile == default_profile
7346                                and f"{pz_prefix}Score" in list_of_pzfields
7347                            ):
7348                                sql_set_info.append(
7349                                    f"""
7350                                        concat(
7351                                            '{pz_prefix}Score=',
7352                                            {pz_prefix}Score{pzfields_sep}{profile}
7353                                        )
7354                                    """
7355                                )
7356
7357                        # PZFlag
7358                        if (
7359                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
7360                            in list_of_pzfields
7361                        ):
7362                            sql_set_info.append(
7363                                f"""
7364                                    concat(
7365                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
7366                                        CASE 
7367                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7368                                            THEN 'PASS'
7369                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7370                                            THEN 'FILTERED'
7371                                        END
7372                                    ) 
7373                                """
7374                            )
7375                            if (
7376                                profile == default_profile
7377                                and f"{pz_prefix}Flag" in list_of_pzfields
7378                            ):
7379                                sql_set_info.append(
7380                                    f"""
7381                                        concat(
7382                                            '{pz_prefix}Flag=',
7383                                            CASE 
7384                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7385                                                THEN 'PASS'
7386                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7387                                                THEN 'FILTERED'
7388                                            END
7389                                        )
7390                                    """
7391                                )
7392
7393                        # PZClass
7394                        if (
7395                            f"{pz_prefix}Class{pzfields_sep}{profile}"
7396                            in list_of_pzfields
7397                        ):
7398                            sql_set_info.append(
7399                                f"""
7400                                    concat(
7401                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
7402                                        CASE
7403                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7404                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7405                                            ELSE '.'
7406                                        END 
7407                                    )
7408                                    
7409                                """
7410                            )
7411                            if (
7412                                profile == default_profile
7413                                and f"{pz_prefix}Class" in list_of_pzfields
7414                            ):
7415                                sql_set_info.append(
7416                                    f"""
7417                                        concat(
7418                                            '{pz_prefix}Class=',
7419                                            CASE
7420                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7421                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7422                                                ELSE '.'
7423                                            END 
7424                                        )
7425                                    """
7426                                )
7427
7428                        # PZComment
7429                        if (
7430                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
7431                            in list_of_pzfields
7432                        ):
7433                            sql_set_info.append(
7434                                f"""
7435                                    CASE
7436                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7437                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
7438                                        ELSE ''
7439                                    END
7440                                """
7441                            )
7442                            if (
7443                                profile == default_profile
7444                                and f"{pz_prefix}Comment" in list_of_pzfields
7445                            ):
7446                                sql_set_info.append(
7447                                    f"""
7448                                        CASE
7449                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7450                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
7451                                            ELSE ''
7452                                        END
7453                                    """
7454                                )
7455
7456                        # PZInfos
7457                        if (
7458                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
7459                            in list_of_pzfields
7460                        ):
7461                            sql_set_info.append(
7462                                f"""
7463                                    CASE
7464                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7465                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
7466                                        ELSE ''
7467                                    END
7468                                """
7469                            )
7470                            if (
7471                                profile == default_profile
7472                                and f"{pz_prefix}Infos" in list_of_pzfields
7473                            ):
7474                                sql_set_info.append(
7475                                    f"""
7476                                        CASE
7477                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7478                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
7479                                            ELSE ''
7480                                        END
7481                                    """
7482                                )
7483
7484                        # Merge PZfields
7485                        sql_set_info_option = ""
7486                        sql_set_sep = ""
7487                        for sql_set in sql_set_info:
7488                            if sql_set_sep:
7489                                sql_set_info_option += f"""
7490                                    , concat('{sql_set_sep}', {sql_set})
7491                                """
7492                            else:
7493                                sql_set_info_option += f"""
7494                                    , {sql_set}
7495                                """
7496                            sql_set_sep = ";"
7497
7498                        sql_queries = []
7499                        for annotation in prioritizations_config[profile]:
7500
7501                            # skip special sections
7502                            if annotation.startswith("_"):
7503                                continue
7504
7505                            # For each criterions
7506                            for criterion in prioritizations_config[profile][
7507                                annotation
7508                            ]:
7509
7510                                # Criterion mode
7511                                criterion_mode = None
7512                                if np.any(
7513                                    np.isin(list(criterion.keys()), ["type", "value"])
7514                                ):
7515                                    criterion_mode = "operation"
7516                                elif np.any(
7517                                    np.isin(list(criterion.keys()), ["sql", "fields"])
7518                                ):
7519                                    criterion_mode = "sql"
7520                                log.debug(f"Criterion Mode: {criterion_mode}")
7521
7522                                # Criterion parameters
7523                                criterion_type = criterion.get("type", None)
7524                                criterion_value = criterion.get("value", None)
7525                                criterion_sql = criterion.get("sql", None)
7526                                criterion_fields = criterion.get("fields", None)
7527                                criterion_score = criterion.get("score", 0)
7528                                criterion_flag = criterion.get("flag", "PASS")
7529                                criterion_class = criterion.get("class", None)
7530                                criterion_flag_bool = criterion_flag == "PASS"
7531                                criterion_comment = (
7532                                    ", ".join(criterion.get("comment", []))
7533                                    .replace("'", "''")
7534                                    .replace(";", ",")
7535                                    .replace("\t", " ")
7536                                )
7537                                criterion_infos = (
7538                                    str(criterion)
7539                                    .replace("'", "''")
7540                                    .replace(";", ",")
7541                                    .replace("\t", " ")
7542                                )
7543
7544                                # SQL
7545                                if criterion_sql is not None and isinstance(
7546                                    criterion_sql, list
7547                                ):
7548                                    criterion_sql = " ".join(criterion_sql)
7549
7550                                # Fields and explode
7551                                if criterion_fields is None:
7552                                    criterion_fields = [annotation]
7553                                if not isinstance(criterion_fields, list):
7554                                    criterion_fields = str(criterion_fields).split(",")
7555
7556                                # Class
7557                                if criterion_class is not None and not isinstance(
7558                                    criterion_class, list
7559                                ):
7560                                    criterion_class = str(criterion_class).split(",")
7561
7562                                for annotation_field in criterion_fields:
7563
7564                                    # Explode specific annotation
7565                                    log.debug(
7566                                        f"Explode annotation '{annotation_field}'"
7567                                    )
7568                                    added_columns += self.explode_infos(
7569                                        prefix=explode_infos_prefix,
7570                                        fields=[annotation_field],
7571                                        table=table_variants,
7572                                    )
7573                                    extra_infos = self.get_extra_infos(
7574                                        table=table_variants
7575                                    )
7576
7577                                    # Check if annotation field is present
7578                                    if (
7579                                        f"{explode_infos_prefix}{annotation_field}"
7580                                        not in extra_infos
7581                                    ):
7582                                        msq_err = f"Annotation '{annotation_field}' not in data"
7583                                        log.error(msq_err)
7584                                        raise ValueError(msq_err)
7585                                    else:
7586                                        log.debug(
7587                                            f"Annotation '{annotation_field}' in data"
7588                                        )
7589
7590                                sql_set = []
7591                                sql_set_info = []
7592
7593                                # PZ fields set
7594
7595                                # PZScore
7596                                if (
7597                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
7598                                    in list_of_pzfields
7599                                ):
7600                                    # VaRank prioritization score mode
7601                                    if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]:
7602                                        sql_set.append(
7603                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
7604                                        )
7605                                    # default HOWARD prioritization score mode
7606                                    else:
7607                                        sql_set.append(
7608                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7609                                        )
7610
7611                                # PZFlag
7612                                if (
7613                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
7614                                    in list_of_pzfields
7615                                ):
7616                                    sql_set.append(
7617                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
7618                                    )
7619
7620                                # PZClass
7621                                if (
7622                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
7623                                    in list_of_pzfields
7624                                    and criterion_class is not None
7625                                ):
7626                                    sql_set.append(
7627                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
7628                                    )
7629
7630                                # PZComment
7631                                if (
7632                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
7633                                    in list_of_pzfields
7634                                ):
7635                                    sql_set.append(
7636                                        f"""
7637                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
7638                                                concat(
7639                                                    {pz_prefix}Comment{pzfields_sep}{profile},
7640                                                    CASE 
7641                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
7642                                                        THEN ', '
7643                                                        ELSE ''
7644                                                    END,
7645                                                    '{criterion_comment}'
7646                                                )
7647                                        """
7648                                    )
7649
7650                                # PZInfos
7651                                if (
7652                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
7653                                    in list_of_pzfields
7654                                ):
7655                                    sql_set.append(
7656                                        f"""
7657                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
7658                                                concat(
7659                                                    {pz_prefix}Infos{pzfields_sep}{profile},
7660                                                    '{criterion_infos}'
7661                                                )
7662                                        """
7663                                    )
7664                                sql_set_option = ",".join(sql_set)
7665
7666                                # Criterion and comparison
7667                                if sql_set_option:
7668
7669                                    if criterion_mode in ["operation"]:
7670
7671                                        try:
7672                                            float(criterion_value)
7673                                            sql_update = f"""
7674                                                UPDATE {table_variants}
7675                                                SET {sql_set_option}
7676                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7677                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
7678                                            """
7679                                        except:
7680                                            contains_option = ""
7681                                            if criterion_type == "contains":
7682                                                contains_option = ".*"
7683                                            sql_update = f"""
7684                                                UPDATE {table_variants}
7685                                                SET {sql_set_option}
7686                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7687                                            """
7688                                        sql_queries.append(sql_update)
7689
7690                                    elif criterion_mode in ["sql"]:
7691
7692                                        sql_update = f"""
7693                                            UPDATE {table_variants}
7694                                            SET {sql_set_option}
7695                                            WHERE {criterion_sql}
7696                                        """
7697                                        sql_queries.append(sql_update)
7698
7699                                    else:
7700                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
7701                                        log.error(msg_err)
7702                                        raise ValueError(msg_err)
7703
7704                                else:
7705                                    log.warning(
7706                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
7707                                    )
7708
7709                        # PZTags
7710                        if (
7711                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
7712                            in list_of_pzfields
7713                        ):
7714
7715                            # Create PZFalgs value
7716                            pztags_value = ""
7717                            pztags_sep_default = ","
7718                            pztags_sep = ""
7719                            for pzfield in pzfields:
7720                                if pzfield not in [f"{pz_prefix}Tags"]:
7721                                    if (
7722                                        f"{pzfield}{pzfields_sep}{profile}"
7723                                        in list_of_pzfields
7724                                    ):
7725                                        if pzfield in [f"{pz_prefix}Flag"]:
7726                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7727                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
7728                                                    THEN 'PASS'
7729                                                    ELSE 'FILTERED'
7730                                                END, '"""
7731                                        elif pzfield in [f"{pz_prefix}Class"]:
7732                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7733                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7734                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7735                                                    ELSE '.'
7736                                                END, '"""
7737                                        else:
7738                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7739                                        pztags_sep = pztags_sep_default
7740
7741                            # Add Query update for PZFlags
7742                            sql_update_pztags = f"""
7743                                UPDATE {table_variants}
7744                                SET INFO = concat(
7745                                        INFO,
7746                                        CASE WHEN INFO NOT in ('','.')
7747                                                THEN ';'
7748                                                ELSE ''
7749                                        END,
7750                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
7751                                    )
7752                                """
7753                            sql_queries.append(sql_update_pztags)
7754
7755                            # Add Query update for PZFlags for default
7756                            if profile == default_profile:
7757                                sql_update_pztags_default = f"""
7758                                UPDATE {table_variants}
7759                                SET INFO = concat(
7760                                        INFO,
7761                                        ';',
7762                                        '{pz_prefix}Tags={pztags_value}'
7763                                    )
7764                                """
7765                                sql_queries.append(sql_update_pztags_default)
7766
7767                        log.info(f"""Profile '{profile}' - Prioritization... """)
7768
7769                        if sql_queries:
7770
7771                            for sql_query in sql_queries:
7772                                log.debug(
7773                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7774                                )
7775                                self.conn.execute(sql_query)
7776
7777                        log.info(f"""Profile '{profile}' - Update... """)
7778                        sql_query_update = f"""
7779                            UPDATE {table_variants}
7780                            SET INFO =  
7781                                concat(
7782                                    CASE
7783                                        WHEN INFO NOT IN ('','.')
7784                                        THEN concat(INFO, ';')
7785                                        ELSE ''
7786                                    END
7787                                    {sql_set_info_option}
7788                                )
7789                        """
7790                        self.conn.execute(sql_query_update)
7791
7792        else:
7793
7794            log.warning(f"No profiles in parameters")
7795
7796        # Remove added columns
7797        for added_column in added_columns:
7798            self.drop_column(column=added_column)
7799
7800        # Explode INFOS fields into table fields
7801        if self.get_explode_infos():
7802            self.explode_infos(
7803                prefix=self.get_explode_infos_prefix(),
7804                fields=self.get_explode_infos_fields(),
7805                force=True,
7806            )
7807
7808        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter in the prioritization function is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file
Returns

A boolean value (True) is being returned from the prioritization function.

def annotation_hgvs(self, threads: int = None) -> None:
7814    def annotation_hgvs(self, threads: int = None) -> None:
7815        """
7816        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
7817        coordinates and alleles.
7818
7819        :param threads: The `threads` parameter is an optional integer that specifies the number of
7820        threads to use for parallel processing. If no value is provided, it will default to the number
7821        of threads obtained from the `get_threads()` method
7822        :type threads: int
7823        """
7824
7825        # Function for each partition of the Dask Dataframe
7826        def partition_function(partition):
7827            """
7828            The function `partition_function` applies the `annotation_hgvs_partition` function to
7829            each row of a DataFrame called `partition`.
7830
7831            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
7832            to be processed
7833            :return: the result of applying the "annotation_hgvs_partition" function to each row of
7834            the "partition" dataframe along the axis 1.
7835            """
7836            return partition.apply(annotation_hgvs_partition, axis=1)
7837
7838        def annotation_hgvs_partition(row) -> str:
7839            """
7840            The function `annotation_hgvs_partition` takes in a row of data and returns a string
7841            containing a list of HGVS names associated with the given genomic coordinates and alleles.
7842
7843            :param row: A dictionary-like object that contains the values for the following keys:
7844            :return: a string that contains the HGVS names associated with the given row of data.
7845            """
7846
7847            chr = row["CHROM"]
7848            pos = row["POS"]
7849            ref = row["REF"]
7850            alt = row["ALT"]
7851
7852            # Find list of associated transcripts
7853            transcripts_list = list(
7854                polars_conn.execute(
7855                    f"""
7856                SELECT transcript
7857                FROM refseq_df
7858                WHERE CHROM='{chr}'
7859                AND POS={pos}
7860            """
7861                )["transcript"]
7862            )
7863
7864            # Full HGVS annotation in list
7865            hgvs_full_list = []
7866
7867            for transcript_name in transcripts_list:
7868
7869                # Transcript
7870                transcript = get_transcript(
7871                    transcripts=transcripts, transcript_name=transcript_name
7872                )
7873                # Exon
7874                if use_exon:
7875                    exon = transcript.find_exon_number(pos)
7876                else:
7877                    exon = None
7878                # Protein
7879                transcript_protein = None
7880                if use_protein or add_protein or full_format:
7881                    transcripts_protein = list(
7882                        polars_conn.execute(
7883                            f"""
7884                        SELECT protein
7885                        FROM refseqlink_df
7886                        WHERE transcript='{transcript_name}'
7887                        LIMIT 1
7888                    """
7889                        )["protein"]
7890                    )
7891                    if len(transcripts_protein):
7892                        transcript_protein = transcripts_protein[0]
7893
7894                # HGVS name
7895                hgvs_name = format_hgvs_name(
7896                    chr,
7897                    pos,
7898                    ref,
7899                    alt,
7900                    genome=genome,
7901                    transcript=transcript,
7902                    transcript_protein=transcript_protein,
7903                    exon=exon,
7904                    use_gene=use_gene,
7905                    use_protein=use_protein,
7906                    full_format=full_format,
7907                    use_version=use_version,
7908                    codon_type=codon_type,
7909                )
7910                hgvs_full_list.append(hgvs_name)
7911                if add_protein and not use_protein and not full_format:
7912                    hgvs_name = format_hgvs_name(
7913                        chr,
7914                        pos,
7915                        ref,
7916                        alt,
7917                        genome=genome,
7918                        transcript=transcript,
7919                        transcript_protein=transcript_protein,
7920                        exon=exon,
7921                        use_gene=use_gene,
7922                        use_protein=True,
7923                        full_format=False,
7924                        use_version=use_version,
7925                        codon_type=codon_type,
7926                    )
7927                    hgvs_full_list.append(hgvs_name)
7928
7929            # Create liste of HGVS annotations
7930            hgvs_full = ",".join(hgvs_full_list)
7931
7932            return hgvs_full
7933
7934        # Polars connexion
7935        polars_conn = pl.SQLContext(register_globals=True, eager=True)
7936
7937        # Config
7938        config = self.get_config()
7939
7940        # Databases
7941        # Genome
7942        databases_genomes_folders = (
7943            config.get("folders", {})
7944            .get("databases", {})
7945            .get("genomes", DEFAULT_GENOME_FOLDER)
7946        )
7947        databases_genome = (
7948            config.get("folders", {}).get("databases", {}).get("genomes", "")
7949        )
7950        # refseq database folder
7951        databases_refseq_folders = (
7952            config.get("folders", {})
7953            .get("databases", {})
7954            .get("refseq", DEFAULT_REFSEQ_FOLDER)
7955        )
7956        # refseq
7957        databases_refseq = config.get("databases", {}).get("refSeq", None)
7958        # refSeqLink
7959        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
7960
7961        # Param
7962        param = self.get_param()
7963
7964        # Quick HGVS
7965        if "hgvs_options" in param and param.get("hgvs_options", ""):
7966            log.info(f"Quick HGVS Annotation:")
7967            if not param.get("hgvs", None):
7968                param["hgvs"] = {}
7969            for option in param.get("hgvs_options", "").split(","):
7970                option_var_val = option.split("=")
7971                option_var = option_var_val[0]
7972                if len(option_var_val) > 1:
7973                    option_val = option_var_val[1]
7974                else:
7975                    option_val = "True"
7976                if option_val.upper() in ["TRUE"]:
7977                    option_val = True
7978                elif option_val.upper() in ["FALSE"]:
7979                    option_val = False
7980                log.info(f"   {option_var}={option_val}")
7981                param["hgvs"][option_var] = option_val
7982
7983        # Check if HGVS annotation enabled
7984        if "hgvs" in param:
7985            log.info(f"HGVS Annotation... ")
7986            for hgvs_option in param.get("hgvs", {}):
7987                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
7988        else:
7989            return
7990
7991        # HGVS Param
7992        param_hgvs = param.get("hgvs", {})
7993        use_exon = param_hgvs.get("use_exon", False)
7994        use_gene = param_hgvs.get("use_gene", False)
7995        use_protein = param_hgvs.get("use_protein", False)
7996        add_protein = param_hgvs.get("add_protein", False)
7997        full_format = param_hgvs.get("full_format", False)
7998        use_version = param_hgvs.get("use_version", False)
7999        codon_type = param_hgvs.get("codon_type", "3")
8000
8001        # refSseq refSeqLink
8002        databases_refseq = param_hgvs.get("refseq", databases_refseq)
8003        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
8004
8005        # Assembly
8006        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
8007
8008        # Genome
8009        genome_file = None
8010        if find_genome(databases_genome):
8011            genome_file = find_genome(databases_genome)
8012        else:
8013            genome_file = find_genome(
8014                genome_path=databases_genomes_folders, assembly=assembly
8015            )
8016        log.debug("Genome: " + str(genome_file))
8017
8018        # refSseq
8019        refseq_file = find_file_prefix(
8020            input_file=databases_refseq,
8021            prefix="ncbiRefSeq",
8022            folder=databases_refseq_folders,
8023            assembly=assembly,
8024        )
8025        log.debug("refSeq: " + str(refseq_file))
8026
8027        # refSeqLink
8028        refseqlink_file = find_file_prefix(
8029            input_file=databases_refseqlink,
8030            prefix="ncbiRefSeqLink",
8031            folder=databases_refseq_folders,
8032            assembly=assembly,
8033        )
8034        log.debug("refSeqLink: " + str(refseqlink_file))
8035
8036        # Threads
8037        if not threads:
8038            threads = self.get_threads()
8039        log.debug("Threads: " + str(threads))
8040
8041        # Variables
8042        table_variants = self.get_table_variants(clause="update")
8043
8044        # Get variants SNV and InDel only
8045        query_variants = f"""
8046            SELECT "#CHROM" AS CHROM, POS, REF, ALT
8047            FROM {table_variants}
8048            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
8049            """
8050        df_variants = self.get_query_to_df(query_variants)
8051
8052        # Added columns
8053        added_columns = []
8054
8055        # Add hgvs column in variants table
8056        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
8057        added_column = self.add_column(
8058            table_variants, hgvs_column_name, "STRING", default_value=None
8059        )
8060        added_columns.append(added_column)
8061
8062        log.debug(f"refSeq loading...")
8063        # refSeq in duckDB
8064        refseq_table = get_refseq_table(
8065            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
8066        )
8067        # Loading all refSeq in Dataframe
8068        refseq_query = f"""
8069            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
8070            FROM {refseq_table}
8071            JOIN df_variants ON (
8072                {refseq_table}.chrom = df_variants.CHROM
8073                AND {refseq_table}.txStart<=df_variants.POS
8074                AND {refseq_table}.txEnd>=df_variants.POS
8075            )
8076        """
8077        refseq_df = self.conn.query(refseq_query).pl()
8078
8079        if refseqlink_file:
8080            log.debug(f"refSeqLink loading...")
8081            # refSeqLink in duckDB
8082            refseqlink_table = get_refseq_table(
8083                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
8084            )
8085            # Loading all refSeqLink in Dataframe
8086            protacc_column = "protAcc_with_ver"
8087            mrnaacc_column = "mrnaAcc_with_ver"
8088            refseqlink_query = f"""
8089                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
8090                FROM {refseqlink_table} 
8091                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
8092                WHERE protAcc_without_ver IS NOT NULL
8093            """
8094            # Polars Dataframe
8095            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
8096
8097        # Read RefSeq transcripts into a python dict/model.
8098        log.debug(f"Transcripts loading...")
8099        with tempfile.TemporaryDirectory() as tmpdir:
8100            transcripts_query = f"""
8101                COPY (
8102                    SELECT {refseq_table}.*
8103                    FROM {refseq_table}
8104                    JOIN df_variants ON (
8105                        {refseq_table}.chrom=df_variants.CHROM
8106                        AND {refseq_table}.txStart<=df_variants.POS
8107                        AND {refseq_table}.txEnd>=df_variants.POS
8108                    )
8109                )
8110                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
8111            """
8112            self.conn.query(transcripts_query)
8113            with open(f"{tmpdir}/transcript.tsv") as infile:
8114                transcripts = read_transcripts(infile)
8115
8116        # Polars connexion
8117        polars_conn = pl.SQLContext(register_globals=True, eager=True)
8118
8119        log.debug("Genome loading...")
8120        # Read genome sequence using pyfaidx.
8121        genome = Fasta(genome_file)
8122
8123        log.debug("Start annotation HGVS...")
8124
8125        # Create
8126        # a Dask Dataframe from Pandas dataframe with partition as number of threads
8127        ddf = dd.from_pandas(df_variants, npartitions=threads)
8128
8129        # Use dask.dataframe.apply() to apply function on each partition
8130        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
8131
8132        # Convert Dask DataFrame to Pandas Dataframe
8133        df = ddf.compute()
8134
8135        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
8136        with tempfile.TemporaryDirectory() as tmpdir:
8137            df_parquet = os.path.join(tmpdir, "df.parquet")
8138            df.to_parquet(df_parquet)
8139
8140            # Update hgvs column
8141            update_variant_query = f"""
8142                UPDATE {table_variants}
8143                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
8144                FROM read_parquet('{df_parquet}') as df
8145                WHERE variants."#CHROM" = df.CHROM
8146                AND variants.POS = df.POS
8147                AND variants.REF = df.REF
8148                AND variants.ALT = df.ALT
8149                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
8150                """
8151            self.execute_query(update_variant_query)
8152
8153        # Update INFO column
8154        sql_query_update = f"""
8155            UPDATE {table_variants}
8156            SET INFO = 
8157                concat(
8158                    CASE 
8159                        WHEN INFO NOT IN ('','.')
8160                        THEN concat(INFO, ';')
8161                        ELSE ''
8162                    END,
8163                    'hgvs=',
8164                    {hgvs_column_name}
8165                )
8166            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
8167            """
8168        self.execute_query(sql_query_update)
8169
8170        # Add header
8171        HGVS_INFOS = {
8172            "hgvs": {
8173                "ID": "hgvs",
8174                "Number": ".",
8175                "Type": "String",
8176                "Description": f"HGVS annotatation with HOWARD",
8177            }
8178        }
8179
8180        for field in HGVS_INFOS:
8181            field_ID = HGVS_INFOS[field]["ID"]
8182            field_description = HGVS_INFOS[field]["Description"]
8183            self.get_header().infos[field_ID] = vcf.parser._Info(
8184                field_ID,
8185                HGVS_INFOS[field]["Number"],
8186                HGVS_INFOS[field]["Type"],
8187                field_description,
8188                "unknown",
8189                "unknown",
8190                code_type_map[HGVS_INFOS[field]["Type"]],
8191            )
8192
8193        # Remove added columns
8194        for added_column in added_columns:
8195            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
8201    def get_operations_help(
8202        self, operations_config_dict: dict = {}, operations_config_file: str = None
8203    ) -> list:
8204
8205        # Init
8206        operations_help = []
8207
8208        # operations
8209        operations = self.get_config_json(
8210            name="calculations",
8211            config_dict=operations_config_dict,
8212            config_file=operations_config_file,
8213        )
8214        for op in operations:
8215            op_name = operations[op].get("name", op).upper()
8216            op_description = operations[op].get("description", op_name)
8217            op_available = operations[op].get("available", False)
8218            if op_available:
8219                operations_help.append(f"   {op_name}: {op_description}")
8220
8221        # Sort operations
8222        operations_help.sort()
8223
8224        # insert header
8225        operations_help.insert(0, "Available calculation operations:")
8226
8227        # Return
8228        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
8230    def calculation(
8231        self,
8232        operations: dict = {},
8233        operations_config_dict: dict = {},
8234        operations_config_file: str = None,
8235    ) -> None:
8236        """
8237        It takes a list of operations, and for each operation, it checks if it's a python or sql
8238        operation, and then calls the appropriate function
8239
8240        param json example:
8241            "calculation": {
8242                "NOMEN": {
8243                    "options": {
8244                        "hgvs_field": "hgvs"
8245                    },
8246                "middle" : null
8247            }
8248        """
8249
8250        # Param
8251        param = self.get_param()
8252
8253        # CHeck operations config file
8254        if operations_config_file is None:
8255            operations_config_file = param.get("calculation", {}).get(
8256                "calculation_config", None
8257            )
8258
8259        # operations config
8260        operations_config = self.get_config_json(
8261            name="calculations",
8262            config_dict=operations_config_dict,
8263            config_file=operations_config_file,
8264        )
8265
8266        # Upper keys
8267        operations_config = {k.upper(): v for k, v in operations_config.items()}
8268
8269        # Calculations
8270
8271        # Operations from param
8272        operations = param.get("calculation", {}).get("calculations", operations)
8273
8274        # Quick calculation - add
8275        if param.get("calculations", None):
8276
8277            # List of operations
8278            calculations_list = [
8279                value.strip() for value in param.get("calculations", "").split(",")
8280            ]
8281
8282            # Log
8283            log.info(f"Quick Calculations:")
8284            for calculation_key in calculations_list:
8285                log.info(f"   {calculation_key}")
8286
8287            # Create tmp operations (to keep operation order)
8288            operations_tmp = {}
8289            for calculation_operation in calculations_list:
8290                if calculation_operation.upper() not in operations_tmp:
8291                    log.debug(
8292                        f"{calculation_operation}.upper() not in {operations_tmp}"
8293                    )
8294                    operations_tmp[calculation_operation.upper()] = {}
8295                    add_value_into_dict(
8296                        dict_tree=operations_tmp,
8297                        sections=[
8298                            calculation_operation.upper(),
8299                        ],
8300                        value=operations.get(calculation_operation.upper(), {}),
8301                    )
8302            # Add operations already in param
8303            for calculation_operation in operations:
8304                if calculation_operation not in operations_tmp:
8305                    operations_tmp[calculation_operation] = operations.get(
8306                        calculation_operation, {}
8307                    )
8308
8309            # Update operations in param
8310            operations = operations_tmp
8311
8312        # Operations for calculation
8313        if not operations:
8314            operations = param.get("calculation", {}).get("calculations", {})
8315
8316        if operations:
8317            log.info(f"Calculations...")
8318
8319        # For each operations
8320        for operation_name in operations:
8321            operation_name = operation_name.upper()
8322            if operation_name not in [""]:
8323                if operation_name in operations_config:
8324                    log.info(f"Calculation '{operation_name}'")
8325                    operation = operations_config[operation_name]
8326                    operation_type = operation.get("type", "sql")
8327                    if operation_type == "python":
8328                        self.calculation_process_function(
8329                            operation=operation, operation_name=operation_name
8330                        )
8331                    elif operation_type == "sql":
8332                        self.calculation_process_sql(
8333                            operation=operation, operation_name=operation_name
8334                        )
8335                    else:
8336                        log.error(
8337                            f"Operations config: Type '{operation_type}' NOT available"
8338                        )
8339                        raise ValueError(
8340                            f"Operations config: Type '{operation_type}' NOT available"
8341                        )
8342                else:
8343                    log.error(
8344                        f"Operations config: Calculation '{operation_name}' NOT available"
8345                    )
8346                    raise ValueError(
8347                        f"Operations config: Calculation '{operation_name}' NOT available"
8348                    )
8349
8350        # Explode INFOS fields into table fields
8351        if self.get_explode_infos():
8352            self.explode_infos(
8353                prefix=self.get_explode_infos_prefix(),
8354                fields=self.get_explode_infos_fields(),
8355                force=True,
8356            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" } }, "middle": null }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
8358    def calculation_process_sql(
8359        self, operation: dict, operation_name: str = "unknown"
8360    ) -> None:
8361        """
8362        The `calculation_process_sql` function takes in a mathematical operation as a string and
8363        performs the operation, updating the specified table with the result.
8364
8365        :param operation: The `operation` parameter is a dictionary that contains information about the
8366        mathematical operation to be performed. It includes the following keys:
8367        :type operation: dict
8368        :param operation_name: The `operation_name` parameter is a string that represents the name of
8369        the mathematical operation being performed. It is used for logging and error handling purposes,
8370        defaults to unknown
8371        :type operation_name: str (optional)
8372        """
8373
8374        # Operation infos
8375        operation_name = operation.get("name", "unknown")
8376        log.debug(f"process SQL {operation_name}")
8377        output_column_name = operation.get("output_column_name", operation_name)
8378        output_column_type = operation.get("output_column_type", "String")
8379        prefix = operation.get("explode_infos_prefix", "")
8380        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
8381        output_column_description = operation.get(
8382            "output_column_description", f"{operation_name} operation"
8383        )
8384        operation_query = operation.get("operation_query", None)
8385        if isinstance(operation_query, list):
8386            operation_query = " ".join(operation_query)
8387        operation_info_fields = operation.get("info_fields", [])
8388        operation_info_fields_check = operation.get("info_fields_check", False)
8389        operation_info = operation.get("operation_info", True)
8390        operation_table = operation.get(
8391            "table", self.get_table_variants(clause="alter")
8392        )
8393
8394        # table variants
8395        if operation_table:
8396            table_variants = operation_table
8397        else:
8398            table_variants = self.get_table_variants(clause="alter")
8399
8400        if operation_query:
8401
8402            # Info fields check
8403            operation_info_fields_check_result = True
8404            if operation_info_fields_check:
8405                header_infos = self.get_header().infos
8406                for info_field in operation_info_fields:
8407                    operation_info_fields_check_result = (
8408                        operation_info_fields_check_result
8409                        and info_field in header_infos
8410                    )
8411
8412            # If info fields available
8413            if operation_info_fields_check_result:
8414
8415                # Added_columns
8416                added_columns = []
8417
8418                # Create VCF header field
8419                vcf_reader = self.get_header()
8420                vcf_reader.infos[output_column_name] = vcf.parser._Info(
8421                    output_column_name,
8422                    ".",
8423                    output_column_type,
8424                    output_column_description,
8425                    "howard calculation",
8426                    "0",
8427                    self.code_type_map.get(output_column_type),
8428                )
8429
8430                # Explode infos if needed
8431                log.debug(f"calculation_process_sql prefix {prefix}")
8432                added_columns += self.explode_infos(
8433                    prefix=prefix,
8434                    fields=[output_column_name] + operation_info_fields,
8435                    force=False,
8436                    table=table_variants,
8437                )
8438
8439                # Create column
8440                added_column = self.add_column(
8441                    table_name=table_variants,
8442                    column_name=prefix + output_column_name,
8443                    column_type=output_column_type_sql,
8444                    default_value="null",
8445                )
8446                added_columns.append(added_column)
8447
8448                # Operation calculation
8449                try:
8450
8451                    # Query to update calculation column
8452                    sql_update = f"""
8453                        UPDATE {table_variants}
8454                        SET "{prefix}{output_column_name}" = ({operation_query})
8455                    """
8456                    self.conn.execute(sql_update)
8457
8458                    # Add to INFO
8459                    if operation_info:
8460                        sql_update_info = f"""
8461                            UPDATE {table_variants}
8462                            SET "INFO" =
8463                                concat(
8464                                    CASE
8465                                        WHEN "INFO" IS NOT NULL
8466                                        THEN concat("INFO", ';')
8467                                        ELSE ''
8468                                    END,
8469                                    '{output_column_name}=',
8470                                    "{prefix}{output_column_name}"
8471                                )
8472                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
8473                        """
8474                        self.conn.execute(sql_update_info)
8475
8476                except:
8477                    log.error(
8478                        f"Operations config: Calculation '{operation_name}' query failed"
8479                    )
8480                    raise ValueError(
8481                        f"Operations config: Calculation '{operation_name}' query failed"
8482                    )
8483
8484                # Remove added columns
8485                for added_column in added_columns:
8486                    log.debug(f"added_column: {added_column}")
8487                    self.drop_column(column=added_column)
8488
8489            else:
8490                log.error(
8491                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8492                )
8493                raise ValueError(
8494                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8495                )
8496
8497        else:
8498            log.error(
8499                f"Operations config: Calculation '{operation_name}' query NOT defined"
8500            )
8501            raise ValueError(
8502                f"Operations config: Calculation '{operation_name}' query NOT defined"
8503            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
8505    def calculation_process_function(
8506        self, operation: dict, operation_name: str = "unknown"
8507    ) -> None:
8508        """
8509        The `calculation_process_function` takes in an operation dictionary and performs the specified
8510        function with the given parameters.
8511
8512        :param operation: The `operation` parameter is a dictionary that contains information about the
8513        operation to be performed. It has the following keys:
8514        :type operation: dict
8515        :param operation_name: The `operation_name` parameter is a string that represents the name of
8516        the operation being performed. It is used for logging purposes, defaults to unknown
8517        :type operation_name: str (optional)
8518        """
8519
8520        operation_name = operation["name"]
8521        log.debug(f"process Python {operation_name}")
8522        function_name = operation["function_name"]
8523        function_params = operation["function_params"]
8524        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
8526    def calculation_variant_id(self) -> None:
8527        """
8528        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
8529        updates the INFO field of a variants table with the variant ID.
8530        """
8531
8532        # variant_id annotation field
8533        variant_id_tag = self.get_variant_id_column()
8534        added_columns = [variant_id_tag]
8535
8536        # variant_id hgvs tags"
8537        vcf_infos_tags = {
8538            variant_id_tag: "howard variant ID annotation",
8539        }
8540
8541        # Variants table
8542        table_variants = self.get_table_variants()
8543
8544        # Header
8545        vcf_reader = self.get_header()
8546
8547        # Add variant_id to header
8548        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
8549            variant_id_tag,
8550            ".",
8551            "String",
8552            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
8553            "howard calculation",
8554            "0",
8555            self.code_type_map.get("String"),
8556        )
8557
8558        # Update
8559        sql_update = f"""
8560            UPDATE {table_variants}
8561            SET "INFO" = 
8562                concat(
8563                    CASE
8564                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8565                        THEN ''
8566                        ELSE concat("INFO", ';')
8567                    END,
8568                    '{variant_id_tag}=',
8569                    "{variant_id_tag}"
8570                )
8571        """
8572        self.conn.execute(sql_update)
8573
8574        # Remove added columns
8575        for added_column in added_columns:
8576            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
8578    def calculation_extract_snpeff_hgvs(
8579        self,
8580        snpeff_hgvs: str = "snpeff_hgvs",
8581        snpeff_field: str = "ANN",
8582    ) -> None:
8583        """
8584        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
8585        annotation field in a VCF file and adds them as a new column in the variants table.
8586
8587        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
8588        function is used to specify the name of the column that will store the HGVS nomenclatures
8589        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
8590        snpeff_hgvs
8591        :type snpeff_hgvs: str (optional)
8592        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
8593        function represents the field in the VCF file that contains SnpEff annotations. This field is
8594        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
8595        to ANN
8596        :type snpeff_field: str (optional)
8597        """
8598
8599        # Snpeff hgvs tags
8600        vcf_infos_tags = {
8601            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
8602        }
8603
8604        # Prefix
8605        prefix = self.get_explode_infos_prefix()
8606        if prefix:
8607            prefix = "INFO/"
8608
8609        # snpEff fields
8610        speff_ann_infos = prefix + snpeff_field
8611        speff_hgvs_infos = prefix + snpeff_hgvs
8612
8613        # Variants table
8614        table_variants = self.get_table_variants()
8615
8616        # Header
8617        vcf_reader = self.get_header()
8618
8619        # Add columns
8620        added_columns = []
8621
8622        # Explode HGVS field in column
8623        added_columns += self.explode_infos(fields=[snpeff_field])
8624
8625        if snpeff_field in vcf_reader.infos:
8626
8627            log.debug(vcf_reader.infos[snpeff_field])
8628
8629            # Extract ANN header
8630            ann_description = vcf_reader.infos[snpeff_field].desc
8631            pattern = r"'(.+?)'"
8632            match = re.search(pattern, ann_description)
8633            if match:
8634                ann_header_match = match.group(1).split(" | ")
8635                ann_header_desc = {}
8636                for i in range(len(ann_header_match)):
8637                    ann_header_info = "".join(
8638                        char for char in ann_header_match[i] if char.isalnum()
8639                    )
8640                    ann_header_desc[ann_header_info] = ann_header_match[i]
8641                if not ann_header_desc:
8642                    raise ValueError("Invalid header description format")
8643            else:
8644                raise ValueError("Invalid header description format")
8645
8646            # Create variant id
8647            variant_id_column = self.get_variant_id_column()
8648            added_columns += [variant_id_column]
8649
8650            # Create dataframe
8651            dataframe_snpeff_hgvs = self.get_query_to_df(
8652                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8653            )
8654
8655            # Create main NOMEN column
8656            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8657                speff_ann_infos
8658            ].apply(
8659                lambda x: extract_snpeff_hgvs(
8660                    str(x), header=list(ann_header_desc.values())
8661                )
8662            )
8663
8664            # Add snpeff_hgvs to header
8665            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
8666                snpeff_hgvs,
8667                ".",
8668                "String",
8669                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
8670                "howard calculation",
8671                "0",
8672                self.code_type_map.get("String"),
8673            )
8674
8675            # Update
8676            sql_update = f"""
8677                UPDATE variants
8678                SET "INFO" = 
8679                    concat(
8680                        CASE
8681                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8682                            THEN ''
8683                            ELSE concat("INFO", ';')
8684                        END,
8685                        CASE 
8686                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8687                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8688                            THEN concat(
8689                                    '{snpeff_hgvs}=',
8690                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8691                                )
8692                            ELSE ''
8693                        END
8694                    )
8695                FROM dataframe_snpeff_hgvs
8696                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8697
8698            """
8699            self.conn.execute(sql_update)
8700
8701            # Delete dataframe
8702            del dataframe_snpeff_hgvs
8703            gc.collect()
8704
8705        else:
8706
8707            log.warning(
8708                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8709            )
8710
8711        # Remove added columns
8712        for added_column in added_columns:
8713            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file. This parameter allows you to rename the resulting tag; defaults to snpeff_hgvs
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
8715    def calculation_snpeff_ann_explode(
8716        self,
8717        uniquify: bool = True,
8718        output_format: str = "fields",
8719        output_prefix: str = "snpeff_",
8720        snpeff_field: str = "ANN",
8721    ) -> None:
8722        """
8723        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
8724        exploding the HGVS field and updating variant information accordingly.
8725
8726        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
8727        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8728        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8729        defaults to True
8730        :type uniquify: bool (optional)
8731        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8732        function specifies the format in which the output annotations will be generated. It has a
8733        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8734        format, defaults to fields
8735        :type output_format: str (optional)
8736        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8737        method is used to specify the prefix that will be added to the output annotations generated
8738        during the calculation process. This prefix helps to differentiate the newly added annotations
8739        from existing ones in the output data. By default, the, defaults to ANN_
8740        :type output_prefix: str (optional)
8741        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8742        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8743        field will be processed to explode the HGVS annotations and update the variant information
8744        accordingly, defaults to ANN
8745        :type snpeff_field: str (optional)
8746        """
8747
8748        # SnpEff annotation field
8749        snpeff_hgvs = "snpeff_ann_explode"
8750
8751        # Snpeff hgvs tags
8752        vcf_infos_tags = {
8753            snpeff_hgvs: "Explode snpEff annotations",
8754        }
8755
8756        # Prefix
8757        prefix = self.get_explode_infos_prefix()
8758        if prefix:
8759            prefix = "INFO/"
8760
8761        # snpEff fields
8762        speff_ann_infos = prefix + snpeff_field
8763        speff_hgvs_infos = prefix + snpeff_hgvs
8764
8765        # Variants table
8766        table_variants = self.get_table_variants()
8767
8768        # Header
8769        vcf_reader = self.get_header()
8770
8771        # Add columns
8772        added_columns = []
8773
8774        # Explode HGVS field in column
8775        added_columns += self.explode_infos(fields=[snpeff_field])
8776        log.debug(f"snpeff_field={snpeff_field}")
8777        log.debug(f"added_columns={added_columns}")
8778
8779        if snpeff_field in vcf_reader.infos:
8780
8781            # Extract ANN header
8782            ann_description = vcf_reader.infos[snpeff_field].desc
8783            pattern = r"'(.+?)'"
8784            match = re.search(pattern, ann_description)
8785            if match:
8786                ann_header_match = match.group(1).split(" | ")
8787                ann_header = []
8788                ann_header_desc = {}
8789                for i in range(len(ann_header_match)):
8790                    ann_header_info = "".join(
8791                        char for char in ann_header_match[i] if char.isalnum()
8792                    )
8793                    ann_header.append(ann_header_info)
8794                    ann_header_desc[ann_header_info] = ann_header_match[i]
8795                if not ann_header_desc:
8796                    raise ValueError("Invalid header description format")
8797            else:
8798                raise ValueError("Invalid header description format")
8799
8800            # Create variant id
8801            variant_id_column = self.get_variant_id_column()
8802            added_columns += [variant_id_column]
8803
8804            # Create dataframe
8805            dataframe_snpeff_hgvs = self.get_query_to_df(
8806                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8807            )
8808
8809            # Create snpEff columns
8810            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8811                speff_ann_infos
8812            ].apply(
8813                lambda x: explode_snpeff_ann(
8814                    str(x),
8815                    uniquify=uniquify,
8816                    output_format=output_format,
8817                    prefix=output_prefix,
8818                    header=list(ann_header_desc.values()),
8819                )
8820            )
8821
8822            # Header
8823            ann_annotations_prefix = ""
8824            if output_format.upper() in ["JSON"]:
8825                ann_annotations_prefix = f"{output_prefix}="
8826                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8827                    output_prefix,
8828                    ".",
8829                    "String",
8830                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8831                    + " - JSON format",
8832                    "howard calculation",
8833                    "0",
8834                    self.code_type_map.get("String"),
8835                )
8836            else:
8837                for ann_annotation in ann_header:
8838                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8839                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8840                        ann_annotation_id,
8841                        ".",
8842                        "String",
8843                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8844                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8845                        "howard calculation",
8846                        "0",
8847                        self.code_type_map.get("String"),
8848                    )
8849
8850            # Update
8851            sql_update = f"""
8852                UPDATE variants
8853                SET "INFO" = 
8854                    concat(
8855                        CASE
8856                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8857                            THEN ''
8858                            ELSE concat("INFO", ';')
8859                        END,
8860                        CASE 
8861                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8862                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8863                            THEN concat(
8864                                '{ann_annotations_prefix}',
8865                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8866                                )
8867                            ELSE ''
8868                        END
8869                    )
8870                FROM dataframe_snpeff_hgvs
8871                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8872
8873            """
8874            self.conn.execute(sql_update)
8875
8876            # Delete dataframe
8877            del dataframe_snpeff_hgvs
8878            gc.collect()
8879
8880        else:
8881
8882            log.warning(
8883                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8884            )
8885
8886        # Remove added columns
8887        for added_column in added_columns:
8888            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data; defaults to snpeff_
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
8890    def calculation_extract_nomen(self) -> None:
8891        """
8892        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
8893        """
8894
8895        # NOMEN field
8896        field_nomen_dict = "NOMEN_DICT"
8897
8898        # NOMEN structure
8899        nomen_dict = {
8900            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
8901            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
8902            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
8903            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
8904            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
8905            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
8906            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
8907            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
8908            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
8909            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
8910        }
8911
8912        # Param
8913        param = self.get_param()
8914
8915        # Prefix
8916        prefix = self.get_explode_infos_prefix()
8917
8918        # Header
8919        vcf_reader = self.get_header()
8920
8921        # Added columns
8922        added_columns = []
8923
8924        # Get HGVS field
8925        hgvs_field = (
8926            param.get("calculation", {})
8927            .get("calculations", {})
8928            .get("NOMEN", {})
8929            .get("options", {})
8930            .get("hgvs_field", "hgvs")
8931        )
8932
8933        # Get NOMEN pattern
8934        nomen_pattern = (
8935            param.get("calculation", {})
8936            .get("calculations", {})
8937            .get("NOMEN", {})
8938            .get("options", {})
8939            .get("pattern", None)
8940        )
8941
8942        # transcripts list of preference sources
8943        transcripts_sources = {}
8944
8945        # Get transcripts
8946        transcripts_file = (
8947            param.get("calculation", {})
8948            .get("calculations", {})
8949            .get("NOMEN", {})
8950            .get("options", {})
8951            .get("transcripts", None)
8952        )
8953        transcripts_file = full_path(transcripts_file)
8954        if transcripts_file:
8955            if os.path.exists(transcripts_file):
8956                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
8957                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
8958                transcripts_sources["file"] = transcripts_from_file
8959            else:
8960                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
8961                log.error(msg_err)
8962                raise ValueError(msg_err)
8963
8964        # Get transcripts table
8965        transcripts_table = (
8966            param.get("calculation", {})
8967            .get("calculations", {})
8968            .get("NOMEN", {})
8969            .get("options", {})
8970            .get("transcripts_table", self.get_table_variants())
8971        )
8972        # Get transcripts column
8973        transcripts_column = (
8974            param.get("calculation", {})
8975            .get("calculations", {})
8976            .get("NOMEN", {})
8977            .get("options", {})
8978            .get("transcripts_column", None)
8979        )
8980
8981        if transcripts_table and transcripts_column:
8982            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
8983            # Explode if not exists
8984            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
8985        else:
8986            extra_field_transcript = f"NULL"
8987
8988        # Transcripts of preference source order
8989        transcripts_order = (
8990            param.get("calculation", {})
8991            .get("calculations", {})
8992            .get("NOMEN", {})
8993            .get("options", {})
8994            .get("transcripts_order", ["column", "file"])
8995        )
8996
8997        # Transcripts from file
8998        transcripts = transcripts_sources.get("file", [])
8999
9000        # Explode HGVS field in column
9001        added_columns += self.explode_infos(fields=[hgvs_field])
9002
9003        # extra infos
9004        extra_infos = self.get_extra_infos()
9005        extra_field = prefix + hgvs_field
9006
9007        if extra_field in extra_infos:
9008
9009            # Create dataframe
9010            dataframe_hgvs = self.get_query_to_df(
9011                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
9012            )
9013
9014            # Create main NOMEN column
9015            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
9016                lambda x: find_nomen(
9017                    hgvs=x.hgvs,
9018                    transcript=x.transcript,
9019                    transcripts=transcripts,
9020                    pattern=nomen_pattern,
9021                    transcripts_source_order=transcripts_order,
9022                ),
9023                axis=1,
9024            )
9025
9026            # Explode NOMEN Structure and create SQL set for update
9027            sql_nomen_fields = []
9028            for nomen_field in nomen_dict:
9029
9030                # Explode each field into a column
9031                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
9032                    lambda x: dict(x).get(nomen_field, "")
9033                )
9034
9035                # Create VCF header field
9036                vcf_reader.infos[nomen_field] = vcf.parser._Info(
9037                    nomen_field,
9038                    ".",
9039                    "String",
9040                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
9041                    "howard calculation",
9042                    "0",
9043                    self.code_type_map.get("String"),
9044                )
9045                sql_nomen_fields.append(
9046                    f"""
9047                        CASE 
9048                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
9049                            THEN concat(
9050                                    ';{nomen_field}=',
9051                                    dataframe_hgvs."{nomen_field}"
9052                                )
9053                            ELSE ''
9054                        END
9055                    """
9056                )
9057
9058            # SQL set for update
9059            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
9060
9061            # Update
9062            sql_update = f"""
9063                UPDATE variants
9064                SET "INFO" = 
9065                    concat(
9066                        CASE
9067                            WHEN "INFO" IS NULL
9068                            THEN ''
9069                            ELSE "INFO"
9070                        END,
9071                        {sql_nomen_fields_set}
9072                    )
9073                FROM dataframe_hgvs
9074                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
9075                    AND variants."POS" = dataframe_hgvs."POS" 
9076                    AND variants."REF" = dataframe_hgvs."REF"
9077                    AND variants."ALT" = dataframe_hgvs."ALT"
9078            """
9079            self.conn.execute(sql_update)
9080
9081            # Delete dataframe
9082            del dataframe_hgvs
9083            gc.collect()
9084
9085        # Remove added columns
9086        for added_column in added_columns:
9087            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        Compute, for each variant, in how many pipelines/samples it was found, and append
        the result to the INFO column of the variants table.

        The per-row value is computed by the project-level ``findbypipeline`` helper applied
        to the FORMAT column and every sample column.

        :param tag: Name of the INFO annotation field to create ("findbypipeline" by
            default). Used both for the VCF header declaration and for the ``tag=value``
            pair appended to INFO.
        :type tag: str (optional)
        """

        # Only meaningful for VCFs with genotypes: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # Description used when declaring the tag in the VCF header
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix for exploded-INFO columns (the computed column reuses this convention)
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the computed value
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Ensure a variant-id column exists to join dataframe rows back to the table;
            # it is dropped again at the end of this method
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and every sample
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Materialize genotypes as a pandas dataframe.
            # NOTE(review): the SQL below references this local by its variable name
            # (presumably resolved via DuckDB's replacement scan) — do not rename it.
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value row by row from the genotype columns
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the findbypipeline tag in the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append "tag=value" to INFO, normalizing NULL/''/'.' INFO to empty first.
            # NOTE(review): hard-codes the "variants" table name — correct only while
            # get_table_variants() returns "variants"; confirm before reuse.
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary variant-id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory eagerly
            del dataframe_findbypipeline
            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
9195    def calculation_genotype_concordance(self) -> None:
9196        """
9197        The function `calculation_genotype_concordance` calculates the genotype concordance for
9198        multi-caller VCF files and updates the variant information in the database.
9199        """
9200
9201        # if FORMAT and samples
9202        if (
9203            "FORMAT" in self.get_header_columns_as_list()
9204            and self.get_header_sample_list()
9205        ):
9206
9207            # genotypeconcordance annotation field
9208            genotypeconcordance_tag = "genotypeconcordance"
9209
9210            # VCF infos tags
9211            vcf_infos_tags = {
9212                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
9213            }
9214
9215            # Prefix
9216            prefix = self.get_explode_infos_prefix()
9217
9218            # Field
9219            genotypeconcordance_infos = prefix + genotypeconcordance_tag
9220
9221            # Variants table
9222            table_variants = self.get_table_variants()
9223
9224            # Header
9225            vcf_reader = self.get_header()
9226
9227            # Create variant id
9228            variant_id_column = self.get_variant_id_column()
9229            added_columns = [variant_id_column]
9230
9231            # variant_id, FORMAT and samples
9232            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9233                self.get_header_sample_list()
9234            )
9235
9236            # Create dataframe
9237            dataframe_genotypeconcordance = self.get_query_to_df(
9238                f""" SELECT {samples_fields} FROM {table_variants} """
9239            )
9240
9241            # Create genotypeconcordance column
9242            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
9243                dataframe_genotypeconcordance.apply(
9244                    lambda row: genotypeconcordance(
9245                        row, samples=self.get_header_sample_list()
9246                    ),
9247                    axis=1,
9248                )
9249            )
9250
9251            # Add genotypeconcordance to header
9252            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
9253                genotypeconcordance_tag,
9254                ".",
9255                "String",
9256                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
9257                "howard calculation",
9258                "0",
9259                self.code_type_map.get("String"),
9260            )
9261
9262            # Update
9263            sql_update = f"""
9264                UPDATE variants
9265                SET "INFO" = 
9266                    concat(
9267                        CASE
9268                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9269                            THEN ''
9270                            ELSE concat("INFO", ';')
9271                        END,
9272                        CASE
9273                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
9274                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
9275                            THEN concat(
9276                                    '{genotypeconcordance_tag}=',
9277                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
9278                                )
9279                            ELSE ''
9280                        END
9281                    )
9282                FROM dataframe_genotypeconcordance
9283                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
9284            """
9285            self.conn.execute(sql_update)
9286
9287            # Remove added columns
9288            for added_column in added_columns:
9289                self.drop_column(column=added_column)
9290
9291            # Delete dataframe
9292            del dataframe_genotypeconcordance
9293            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
9295    def calculation_barcode(self, tag: str = "barcode") -> None:
9296        """
9297        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
9298        updates the INFO field in the file with the calculated barcode values.
9299
9300        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
9301        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
9302        the default tag name is set to "barcode", defaults to barcode
9303        :type tag: str (optional)
9304        """
9305
9306        # if FORMAT and samples
9307        if (
9308            "FORMAT" in self.get_header_columns_as_list()
9309            and self.get_header_sample_list()
9310        ):
9311
9312            # barcode annotation field
9313            if not tag:
9314                tag = "barcode"
9315
9316            # VCF infos tags
9317            vcf_infos_tags = {
9318                tag: "barcode calculation (VaRank)",
9319            }
9320
9321            # Prefix
9322            prefix = self.get_explode_infos_prefix()
9323
9324            # Field
9325            barcode_infos = prefix + tag
9326
9327            # Variants table
9328            table_variants = self.get_table_variants()
9329
9330            # Header
9331            vcf_reader = self.get_header()
9332
9333            # Create variant id
9334            variant_id_column = self.get_variant_id_column()
9335            added_columns = [variant_id_column]
9336
9337            # variant_id, FORMAT and samples
9338            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9339                self.get_header_sample_list()
9340            )
9341
9342            # Create dataframe
9343            dataframe_barcode = self.get_query_to_df(
9344                f""" SELECT {samples_fields} FROM {table_variants} """
9345            )
9346
9347            # Create barcode column
9348            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
9349                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
9350            )
9351
9352            # Add barcode to header
9353            vcf_reader.infos[tag] = vcf.parser._Info(
9354                tag,
9355                ".",
9356                "String",
9357                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
9358                "howard calculation",
9359                "0",
9360                self.code_type_map.get("String"),
9361            )
9362
9363            # Update
9364            sql_update = f"""
9365                UPDATE {table_variants}
9366                SET "INFO" = 
9367                    concat(
9368                        CASE
9369                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9370                            THEN ''
9371                            ELSE concat("INFO", ';')
9372                        END,
9373                        CASE
9374                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
9375                            AND dataframe_barcode."{barcode_infos}" NOT NULL
9376                            THEN concat(
9377                                    '{tag}=',
9378                                    dataframe_barcode."{barcode_infos}"
9379                                )
9380                            ELSE ''
9381                        END
9382                    )
9383                FROM dataframe_barcode
9384                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
9385            """
9386            self.conn.execute(sql_update)
9387
9388            # Remove added columns
9389            for added_column in added_columns:
9390                self.drop_column(column=added_column)
9391
9392            # Delete dataframe
9393            del dataframe_barcode
9394            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Compute a family barcode across the samples of a pedigree and append it to each
        sample's genotype as two new FORMAT fields: the barcode itself (``tag``) and the
        list of samples it was computed from (``tag`` + "S").

        The pedigree comes from param ``calculation.calculations.BARCODEFAMILY.family_pedigree``
        and may be a YAML file path, a JSON string, a comma-separated sample list, or a
        dict; when absent, all samples of the VCF are used.

        :param tag: FORMAT tag name for the family barcode; falsy values fall back to
            "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or resolves to no samples
        """

        # Only meaningful for VCFs with genotypes: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag if an empty value was passed
            if not tag:
                tag = "BCF"

            # Descriptions for the two FORMAT fields declared below
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded-INFO columns (the computed column reuses this convention)
            prefix = self.get_explode_infos_prefix()

            # Pedigree definition from the calculation parameters
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a file path: parse it as YAML
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = yaml.safe_load(ped)

                # Pedigree is a string: try JSON first, else treat it as a
                # comma-separated list of sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict: use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Any other type is rejected
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Samples involved in the barcode are the pedigree's values
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample of the VCF
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Reject an empty pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Ensure a variant-id column exists to join dataframe rows back to the table;
            # it is dropped again at the end of this method
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and the pedigree samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Materialize genotypes as a pandas dataframe.
            # NOTE(review): the SQL below references this local by its variable name
            # (presumably resolved via DuckDB's replacement scan) — do not rename it.
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode row by row over the pedigree samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the barcode FORMAT field in the VCF header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            # Declare the companion "samples" FORMAT field in the VCF header
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per genotype column (every sample plus FORMAT):
            # pedigree samples get the barcode and the sample list, FORMAT gets the
            # two new field names, other samples get missing values ('.')
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For './.'-only genotypes, pad with one '.' per FORMAT field first:
                # strip value characters from FORMAT, then turn each ':' into ':.'
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary variant-id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory eagerly
            del dataframe_barcode
            gc.collect()

The calculation_barcode_family function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
9586    def calculation_trio(self) -> None:
9587        """
9588        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
9589        information to the INFO field of each variant.
9590        """
9591
9592        # if FORMAT and samples
9593        if (
9594            "FORMAT" in self.get_header_columns_as_list()
9595            and self.get_header_sample_list()
9596        ):
9597
9598            # trio annotation field
9599            trio_tag = "trio"
9600
9601            # VCF infos tags
9602            vcf_infos_tags = {
9603                "trio": "trio calculation",
9604            }
9605
9606            # Param
9607            param = self.get_param()
9608
9609            # Prefix
9610            prefix = self.get_explode_infos_prefix()
9611
9612            # Trio param
9613            trio_ped = (
9614                param.get("calculation", {})
9615                .get("calculations", {})
9616                .get("TRIO", {})
9617                .get("trio_pedigree", None)
9618            )
9619
9620            # Load trio
9621            if trio_ped:
9622
9623                # Trio pedigree is a file
9624                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
9625                    log.debug("TRIO pedigree is file")
9626                    with open(full_path(trio_ped)) as trio_ped:
9627                        trio_ped = yaml.safe_load(trio_ped)
9628
9629                # Trio pedigree is a string
9630                elif isinstance(trio_ped, str):
9631                    log.debug("TRIO pedigree is str")
9632                    try:
9633                        trio_ped = json.loads(trio_ped)
9634                        log.debug("TRIO pedigree is json str")
9635                    except ValueError as e:
9636                        trio_samples = trio_ped.split(",")
9637                        if len(trio_samples) == 3:
9638                            trio_ped = {
9639                                "father": trio_samples[0],
9640                                "mother": trio_samples[1],
9641                                "child": trio_samples[2],
9642                            }
9643                            log.debug("TRIO pedigree is list str")
9644                        else:
9645                            msg_error = "TRIO pedigree not well formatted"
9646                            log.error(msg_error)
9647                            raise ValueError(msg_error)
9648
9649                # Trio pedigree is a dict
9650                elif isinstance(trio_ped, dict):
9651                    log.debug("TRIO pedigree is dict")
9652
9653                # Trio pedigree is not well formatted
9654                else:
9655                    msg_error = "TRIO pedigree not well formatted"
9656                    log.error(msg_error)
9657                    raise ValueError(msg_error)
9658
9659                # Construct trio list
9660                trio_samples = [
9661                    trio_ped.get("father", ""),
9662                    trio_ped.get("mother", ""),
9663                    trio_ped.get("child", ""),
9664                ]
9665
9666            else:
9667                log.debug("TRIO pedigree not defined. Take the first 3 samples")
9668                samples_list = self.get_header_sample_list()
9669                if len(samples_list) >= 3:
9670                    trio_samples = self.get_header_sample_list()[0:3]
9671                    trio_ped = {
9672                        "father": trio_samples[0],
9673                        "mother": trio_samples[1],
9674                        "child": trio_samples[2],
9675                    }
9676                else:
9677                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
9678                    log.error(msg_error)
9679                    raise ValueError(msg_error)
9680
9681            # Check trio pedigree
9682            if not trio_ped or len(trio_ped) != 3:
9683                msg_error = f"Error in TRIO pedigree: {trio_ped}"
9684                log.error(msg_error)
9685                raise ValueError(msg_error)
9686
9687            # Log
9688            log.info(
9689                f"Calculation 'TRIO' - Samples: "
9690                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
9691            )
9692
9693            # Field
9694            trio_infos = prefix + trio_tag
9695
9696            # Variants table
9697            table_variants = self.get_table_variants()
9698
9699            # Header
9700            vcf_reader = self.get_header()
9701
9702            # Create variant id
9703            variant_id_column = self.get_variant_id_column()
9704            added_columns = [variant_id_column]
9705
9706            # variant_id, FORMAT and samples
9707            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9708                self.get_header_sample_list()
9709            )
9710
9711            # Create dataframe
9712            dataframe_trio = self.get_query_to_df(
9713                f""" SELECT {samples_fields} FROM {table_variants} """
9714            )
9715
9716            # Create trio column
9717            dataframe_trio[trio_infos] = dataframe_trio.apply(
9718                lambda row: trio(row, samples=trio_samples), axis=1
9719            )
9720
9721            # Add trio to header
9722            vcf_reader.infos[trio_tag] = vcf.parser._Info(
9723                trio_tag,
9724                ".",
9725                "String",
9726                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
9727                "howard calculation",
9728                "0",
9729                self.code_type_map.get("String"),
9730            )
9731
9732            # Update
9733            sql_update = f"""
9734                UPDATE {table_variants}
9735                SET "INFO" = 
9736                    concat(
9737                        CASE
9738                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9739                            THEN ''
9740                            ELSE concat("INFO", ';')
9741                        END,
9742                        CASE
9743                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
9744                             AND dataframe_trio."{trio_infos}" NOT NULL
9745                            THEN concat(
9746                                    '{trio_tag}=',
9747                                    dataframe_trio."{trio_infos}"
9748                                )
9749                            ELSE ''
9750                        END
9751                    )
9752                FROM dataframe_trio
9753                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
9754            """
9755            self.conn.execute(sql_update)
9756
9757            # Remove added columns
9758            for added_column in added_columns:
9759                self.drop_column(column=added_column)
9760
9761            # Delete dataframe
9762            del dataframe_trio
9763            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9765    def calculation_vaf_normalization(self) -> None:
9766        """
9767        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9768        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9769        :return: The function does not return anything.
9770        """
9771
9772        # if FORMAT and samples
9773        if (
9774            "FORMAT" in self.get_header_columns_as_list()
9775            and self.get_header_sample_list()
9776        ):
9777
9778            # vaf_normalization annotation field
9779            vaf_normalization_tag = "VAF"
9780
9781            # VCF infos tags
9782            vcf_infos_tags = {
9783                "VAF": "VAF Variant Frequency",
9784            }
9785
9786            # Prefix
9787            prefix = self.get_explode_infos_prefix()
9788
9789            # Variants table
9790            table_variants = self.get_table_variants()
9791
9792            # Header
9793            vcf_reader = self.get_header()
9794
9795            # Do not calculate if VAF already exists
9796            if "VAF" in vcf_reader.formats:
9797                log.debug("VAF already on genotypes")
9798                return
9799
9800            # Create variant id
9801            variant_id_column = self.get_variant_id_column()
9802            added_columns = [variant_id_column]
9803
9804            # variant_id, FORMAT and samples
9805            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9806                f""" "{sample}" """ for sample in self.get_header_sample_list()
9807            )
9808
9809            # Create dataframe
9810            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9811            log.debug(f"query={query}")
9812            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9813
9814            vaf_normalization_set = []
9815
9816            # for each sample vaf_normalization
9817            for sample in self.get_header_sample_list():
9818                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9819                    lambda row: vaf_normalization(row, sample=sample), axis=1
9820                )
9821                vaf_normalization_set.append(
9822                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9823                )
9824
9825            # Add VAF to FORMAT
9826            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9827                "FORMAT"
9828            ].apply(lambda x: str(x) + ":VAF")
9829            vaf_normalization_set.append(
9830                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9831            )
9832
9833            # Add vaf_normalization to header
9834            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9835                id=vaf_normalization_tag,
9836                num="1",
9837                type="Float",
9838                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9839                type_code=self.code_type_map.get("Float"),
9840            )
9841
9842            # Create fields to add in INFO
9843            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9844
9845            # Update
9846            sql_update = f"""
9847                UPDATE {table_variants}
9848                SET {sql_vaf_normalization_set}
9849                FROM dataframe_vaf_normalization
9850                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9851
9852            """
9853            self.conn.execute(sql_update)
9854
9855            # Remove added columns
9856            for added_column in added_columns:
9857                self.drop_column(column=added_column)
9858
9859            # Delete dataframe
9860            del dataframe_vaf_normalization
9861            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Compute per-variant genotype statistics for a FORMAT field and append
        them to the INFO column of the variants table.

        For each variant, the values of ``info`` (e.g. ``VAF``) across all
        samples are summarized into several statistics (number, list, min, max,
        mean, median, standard deviation) computed by the ``genotype_stats``
        helper on a pandas dataframe, then written back to the variants table
        as ``<info>_stats_*`` INFO tags via a single SQL UPDATE joining the
        dataframe. Does nothing if the file has no FORMAT column or no samples.

        :param info: Name of the genotype information field to summarize
            (used both to read sample values and to name the generated
            ``<info>_stats_*`` INFO tags), defaults to "VAF"
        :type info: str (optional)
        """

        # Only applicable when the VCF has genotype data (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Name of the intermediate column holding the full stats dict
            vaf_stats_tag = info + "_stats"

            # INFO tag -> header description for each generated statistic.
            # NOTE(review): "_stats_mediane" (median) is the tag name emitted in
            # output files; kept as-is for compatibility with existing data.
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column that will hold the stats dict per variant
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table name
            table_variants = self.get_table_variants()

            # VCF header object (INFO declarations are added to it below)
            vcf_reader = self.get_header()

            # Temporary variant id column, used as join key for the UPDATE;
            # dropped again at the end of this method
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT and every sample column
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Materialize genotype columns into a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the stats dict for each variant row (one dict per row)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic, concatenated into INFO
            sql_vaf_stats_fields = []

            # For each statistic: extract its value into its own dataframe
            # column, declare it in the header, and build its INFO fragment
            for stat in vcf_infos_tags:

                # Pull this statistic out of the per-row stats dict
                # (lambda is applied immediately, so the loop variable `stat`
                # is evaluated at the right iteration)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic as an INFO field in the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator only between tags, not before the first one
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Emit '<sep><stat>=<value>' when the value is present, else ''
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # All fragments combined for the concat() in the UPDATE
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the statistics to INFO, joining on the variant id column.
            # The dataframe is referenced by name in the query (DuckDB resolves
            # the local pandas dataframe `dataframe_vaf_stats`).
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Drop the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory
            del dataframe_vaf_stats
            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, defaults to VAF
def calculation_transcripts_annotation(self, info_json: str = None, info_format: str = None) -> None:
10001    def calculation_transcripts_annotation(
10002        self, info_json: str = None, info_format: str = None
10003    ) -> None:
10004        """
10005        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10006        field to it if transcripts are available.
10007
10008        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10009        is a string parameter that represents the information field to be used in the transcripts JSON.
10010        It is used to specify the JSON format for the transcripts information. If no value is provided
10011        when calling the method, it defaults to "
10012        :type info_json: str
10013        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10014        method is a string parameter that specifies the format of the information field to be used in
10015        the transcripts JSON. It is used to define the format of the information field
10016        :type info_format: str
10017        """
10018
10019        # Create transcripts table
10020        transcripts_table = self.create_transcript_view()
10021
10022        # Add info field
10023        if transcripts_table:
10024            self.transcript_view_to_variants(
10025                transcripts_table=transcripts_table,
10026                transcripts_info_field_json=info_json,
10027                transcripts_info_field_format=info_format,
10028            )
10029        else:
10030            log.info("No Transcripts to process. Check param.json file configuration")

The calculation_transcripts_annotation function creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info_json: The info_json parameter in the calculation_transcripts_annotation method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. If no value is provided when calling the method, it defaults to None.
  • info_format: The info_format parameter in the calculation_transcripts_annotation method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field
def calculation_transcripts_prioritization(self) -> None:
10032    def calculation_transcripts_prioritization(self) -> None:
10033        """
10034        The function `calculation_transcripts_prioritization` creates a transcripts table and
10035        prioritizes transcripts based on certain criteria.
10036        """
10037
10038        # Create transcripts table
10039        transcripts_table = self.create_transcript_view()
10040
10041        # Add info field
10042        if transcripts_table:
10043            self.transcripts_prioritization(transcripts_table=transcripts_table)
10044        else:
10045            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def calculation_transcripts_export(self) -> None:
10047    def calculation_transcripts_export(self) -> None:
10048        """ """
10049
10050        # Create transcripts table
10051        transcripts_table = self.create_transcript_view()
10052
10053        # Add info field
10054        if transcripts_table:
10055            self.transcripts_export(transcripts_table=transcripts_table)
10056        else:
10057            log.info("No Transcripts to process. Check param.json file configuration")
def transcripts_export(self, transcripts_table: str = None, param: dict = {}) -> bool:
10063    def transcripts_export(
10064        self, transcripts_table: str = None, param: dict = {}
10065    ) -> bool:
10066        """ """
10067
10068        log.debug("Start transcripts export...")
10069
10070        # Param
10071        if not param:
10072            param = self.get_param()
10073
10074        # Param export
10075        param_transcript_export = param.get("transcripts", {}).get("export", {})
10076
10077        # Output file
10078        transcripts_export_output = param_transcript_export.get("output", None)
10079
10080        if not param_transcript_export or not transcripts_export_output:
10081            log.warning(f"No transcriipts export parameters defined!")
10082            return False
10083
10084        # List of transcripts annotations
10085        query_describe = f"""
10086            SELECT column_name
10087            FROM (
10088                    DESCRIBE SELECT * FROM {transcripts_table}
10089                )
10090            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10091        """
10092        transcripts_annotations_list = list(
10093            self.get_query_to_df(query=query_describe)["column_name"]
10094        )
10095
10096        # Create transcripts table for export
10097        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10098            random.choices(string.ascii_uppercase + string.digits, k=10)
10099        )
10100        query_create_transcripts_table_export = f"""
10101            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10102        """
10103        self.execute_query(query=query_create_transcripts_table_export)
10104
10105        # Output file format
10106        transcripts_export_output_format = get_file_format(
10107            filename=transcripts_export_output
10108        )
10109
10110        # Format VCF - construct INFO
10111        if transcripts_export_output_format in ["vcf"]:
10112
10113            # Construct query update INFO and header
10114            query_update_info = []
10115            for field in transcripts_annotations_list:
10116
10117                # If field not in header
10118                if field not in self.get_header_infos_list():
10119
10120                    # Add PZ Transcript in header
10121                    self.get_header().infos[field] = vcf.parser._Info(
10122                        field,
10123                        ".",
10124                        "String",
10125                        f"Annotation '{field}' from transcript view",
10126                        "unknown",
10127                        "unknown",
10128                        0,
10129                    )
10130
10131                # Add field as INFO/tag
10132                query_update_info.append(
10133                    f"""
10134                        CASE
10135                            WHEN "{field}" IS NOT NULL
10136                            THEN concat('{field}=', "{field}", ';')    
10137                            ELSE ''     
10138                        END
10139                        """
10140                )
10141
10142            # Query param
10143            query_update_info_value = (
10144                f""" concat('',  {", ".join(query_update_info)}) """
10145            )
10146            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10147
10148        else:
10149
10150            # Query param
10151            query_update_info_value = f""" NULL """
10152            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10153
10154        # Update query INFO column
10155        query_update = f"""
10156            UPDATE {transcripts_table_export}
10157            SET INFO = {query_update_info_value}
10158
10159        """
10160        self.execute_query(query=query_update)
10161
10162        # Export
10163        self.export_output(
10164            output_file=transcripts_export_output,
10165            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10166        )
10167
10168        # Drop transcripts export table
10169        query_drop_transcripts_table_export = f"""
10170            DROP TABLE {transcripts_table_export}
10171        """
10172        self.execute_query(query=query_drop_transcripts_table_export)
def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
10174    def transcripts_prioritization(
10175        self, transcripts_table: str = None, param: dict = {}
10176    ) -> bool:
10177        """
10178        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10179        and updates the variants table with the prioritized information.
10180
10181        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10182        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10183        This parameter is used to identify the table where the transcripts data is stored for the
10184        prioritization process
10185        :type transcripts_table: str
10186        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10187        that contains various configuration settings for the prioritization process of transcripts. It
10188        is used to customize the behavior of the prioritization algorithm and includes settings such as
10189        the prefix for prioritization fields, default profiles, and other
10190        :type param: dict
10191        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10192        transcripts prioritization process is successfully completed, and `False` if there are any
10193        issues or if no profile is defined for transcripts prioritization.
10194        """
10195
10196        log.debug("Start transcripts prioritization...")
10197
10198        # Param
10199        if not param:
10200            param = self.get_param()
10201
10202        # Variants table
10203        table_variants = self.get_table_variants()
10204
10205        # Transcripts table
10206        if transcripts_table is None:
10207            transcripts_table = self.create_transcript_view(
10208                transcripts_table="transcripts", param=param
10209            )
10210        if transcripts_table is None:
10211            msg_err = "No Transcripts table availalble"
10212            log.error(msg_err)
10213            raise ValueError(msg_err)
10214        log.debug(f"transcripts_table={transcripts_table}")
10215
10216        # Get transcripts columns
10217        columns_as_list_query = f"""
10218            DESCRIBE {transcripts_table}
10219        """
10220        columns_as_list = list(
10221            self.get_query_to_df(columns_as_list_query)["column_name"]
10222        )
10223
10224        # Create INFO if not exists
10225        if "INFO" not in columns_as_list:
10226            query_add_info = f"""
10227                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10228            """
10229            self.execute_query(query_add_info)
10230
10231        # Prioritization param and Force only PZ Score and Flag
10232        pz_param = param.get("transcripts", {}).get("prioritization", {})
10233
10234        # PZ profile by default
10235        pz_profile_default = (
10236            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10237        )
10238
10239        # Exit if no profile
10240        if pz_profile_default is None:
10241            log.warning("No profile defined for transcripts prioritization")
10242            return False
10243
10244        # PZ fields
10245        pz_param_pzfields = {}
10246
10247        # PZ field transcripts
10248        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10249
10250        # Add PZ Transcript in header
10251        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10252            pz_fields_transcripts,
10253            ".",
10254            "String",
10255            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10256            "unknown",
10257            "unknown",
10258            code_type_map["String"],
10259        )
10260
10261        # Mandatory fields
10262        pz_mandatory_fields_list = [
10263            "Score",
10264            "Flag",
10265            "Tags",
10266            "Comment",
10267            "Infos",
10268            "Class",
10269        ]
10270        pz_mandatory_fields = []
10271        for pz_mandatory_field in pz_mandatory_fields_list:
10272            pz_mandatory_fields.append(
10273                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10274            )
10275
10276        # PZ fields in param
10277        for pz_field in pz_param.get("pzfields", []):
10278            if pz_field in pz_mandatory_fields_list:
10279                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10280                    pz_param.get("pzprefix", "PTZ") + pz_field
10281                )
10282            else:
10283                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10284                pz_param_pzfields[pz_field] = pz_field_new
10285
10286                # Add PZ Transcript in header
10287                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10288                    pz_field_new,
10289                    ".",
10290                    "String",
10291                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10292                    "unknown",
10293                    "unknown",
10294                    code_type_map["String"],
10295                )
10296
10297        # PZ fields param
10298        pz_param["pzfields"] = pz_mandatory_fields
10299
10300        # Prioritization
10301        prioritization_result = self.prioritization(
10302            table=transcripts_table,
10303            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10304        )
10305        if not prioritization_result:
10306            log.warning("Transcripts prioritization not processed")
10307            return False
10308
10309        # PZ fields sql query
10310        query_update_select_list = []
10311        query_update_concat_list = []
10312        query_update_order_list = []
10313        for pz_param_pzfield in set(
10314            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10315        ):
10316            query_update_select_list.append(f" {pz_param_pzfield}, ")
10317
10318        for pz_param_pzfield in pz_param_pzfields:
10319            query_update_concat_list.append(
10320                f"""
10321                    , CASE 
10322                        WHEN {pz_param_pzfield} IS NOT NULL
10323                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10324                        ELSE ''
10325                    END
10326                """
10327            )
10328
10329        # Order by
10330        pz_orders = (
10331            param.get("transcripts", {})
10332            .get("prioritization", {})
10333            .get("prioritization_transcripts_order", {})
10334        )
10335        if not pz_orders:
10336            pz_orders = {
10337                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10338                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10339            }
10340        for pz_order in pz_orders:
10341            query_update_order_list.append(
10342                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10343            )
10344
10345        # Fields to explode
10346        fields_to_explode = (
10347            list(pz_param_pzfields.keys())
10348            + pz_mandatory_fields
10349            + list(pz_orders.keys())
10350        )
10351        # Remove transcript column as a specific transcript column
10352        if "transcript" in fields_to_explode:
10353            fields_to_explode.remove("transcript")
10354
10355        # Fields intranscripts table
10356        query_transcripts_table = f"""
10357            DESCRIBE SELECT * FROM {transcripts_table}
10358        """
10359        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10360
10361        # Check fields to explode
10362        for field_to_explode in fields_to_explode:
10363            if field_to_explode not in self.get_header_infos_list() + list(
10364                query_transcripts_table.column_name
10365            ):
10366                msg_err = f"INFO/{field_to_explode} NOT IN header"
10367                log.error(msg_err)
10368                raise ValueError(msg_err)
10369
10370        # Explode fields to explode
10371        self.explode_infos(
10372            table=transcripts_table,
10373            fields=fields_to_explode,
10374        )
10375
10376        # Transcript preference file
10377        transcripts_preference_file = (
10378            param.get("transcripts", {})
10379            .get("prioritization", {})
10380            .get("prioritization_transcripts", {})
10381        )
10382        transcripts_preference_file = full_path(transcripts_preference_file)
10383
10384        # Transcript preference forced
10385        transcript_preference_force = (
10386            param.get("transcripts", {})
10387            .get("prioritization", {})
10388            .get("prioritization_transcripts_force", False)
10389        )
10390        # Transcript version forced
10391        transcript_version_force = (
10392            param.get("transcripts", {})
10393            .get("prioritization", {})
10394            .get("prioritization_transcripts_version_force", False)
10395        )
10396
10397        # Transcripts Ranking
10398        if transcripts_preference_file:
10399
10400            # Transcripts file to dataframe
10401            if os.path.exists(transcripts_preference_file):
10402                transcripts_preference_dataframe = transcripts_file_to_df(
10403                    transcripts_preference_file
10404                )
10405            else:
10406                log.error(
10407                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10408                )
10409                raise ValueError(
10410                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10411                )
10412
10413            # Order by depending to transcript preference forcing
10414            if transcript_preference_force:
10415                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10416            else:
10417                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10418
10419            # Transcript columns joined depend on version consideration
10420            if transcript_version_force:
10421                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10422            else:
10423                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10424
10425            # Query ranking for update
10426            query_update_ranking = f"""
10427                SELECT
10428                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10429                    ROW_NUMBER() OVER (
10430                        PARTITION BY "#CHROM", POS, REF, ALT
10431                        ORDER BY {order_by}
10432                    ) AS rn
10433                FROM {transcripts_table}
10434                LEFT JOIN 
10435                    (
10436                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10437                        FROM transcripts_preference_dataframe
10438                    ) AS transcripts_preference
10439                ON {transcripts_version_join}
10440            """
10441
10442        else:
10443
10444            # Query ranking for update
10445            query_update_ranking = f"""
10446                SELECT
10447                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10448                    ROW_NUMBER() OVER (
10449                        PARTITION BY "#CHROM", POS, REF, ALT
10450                        ORDER BY {" , ".join(query_update_order_list)}
10451                    ) AS rn
10452                FROM {transcripts_table}
10453            """
10454
10455        # Export Transcripts prioritization infos to variants table
10456        query_update = f"""
10457            WITH RankedTranscripts AS (
10458                {query_update_ranking}
10459            )
10460            UPDATE {table_variants}
10461                SET
10462                INFO = CONCAT(CASE
10463                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10464                            THEN ''
10465                            ELSE concat("INFO", ';')
10466                        END,
10467                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10468                        )
10469            FROM
10470                RankedTranscripts
10471            WHERE
10472                rn = 1
10473                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10474                AND variants."POS" = RankedTranscripts."POS"
10475                AND variants."REF" = RankedTranscripts."REF"
10476                AND variants."ALT" = RankedTranscripts."ALT"     
10477        """
10478
10479        # log.debug(f"query_update={query_update}")
10480        self.execute_query(query=query_update)
10481
10482        # Return
10483        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10485    def create_transcript_view_from_columns_map(
10486        self,
10487        transcripts_table: str = "transcripts",
10488        columns_maps: dict = {},
10489        added_columns: list = [],
10490        temporary_tables: list = None,
10491        annotation_fields: list = None,
10492        column_rename: dict = {},
10493        column_clean: bool = False,
10494        column_case: str = None,
10495    ) -> tuple[list, list, list]:
10496        """
10497        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10498        specified columns mapping for transcripts data.
10499
10500        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10501        of the table where the transcripts data is stored or will be stored in the database. This table
10502        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10503        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10504        :type transcripts_table: str (optional)
10505        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10506        about how to map columns from a transcripts table to create a view. Each entry in the
10507        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10508        typically includes details such as the main transcript column and additional information columns
10509        :type columns_maps: dict
10510        :param added_columns: The `added_columns` parameter in the
10511        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10512        that will be added to the view being created based on the columns map provided. These columns
10513        are generated by exploding the transcript information columns along with the main transcript
10514        column
10515        :type added_columns: list
10516        :param temporary_tables: The `temporary_tables` parameter in the
10517        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10518        tables created during the process of creating a transcript view from a columns map. These
10519        temporary tables are used to store intermediate results or transformations before the final view
10520        is generated
10521        :type temporary_tables: list
10522        :param annotation_fields: The `annotation_fields` parameter in the
10523        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10524        used for annotation in the query view creation process. These fields are extracted from the
10525        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10526        :type annotation_fields: list
10527        :param column_rename: The `column_rename` parameter in the
10528        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10529        custom renaming for columns during the creation of the temporary table view. This parameter
10530        provides a mapping of original column names to the desired renamed column names. By using this
10531        parameter,
10532        :type column_rename: dict
10533        :param column_clean: The `column_clean` parameter in the
10534        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10535        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10536        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10537        False
10538        :type column_clean: bool (optional)
10539        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10540        function is used to specify the case transformation to be applied to the columns during the view
10541        creation process. It allows you to control whether the column values should be converted to
10542        lowercase, uppercase, or remain unchanged
10543        :type column_case: str
10544        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10545        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10546        """
10547
10548        log.debug("Start transcrpts view creation from columns map...")
10549
10550        # "from_columns_map": [
10551        #     {
10552        #         "transcripts_column": "Ensembl_transcriptid",
10553        #         "transcripts_infos_columns": [
10554        #             "genename",
10555        #             "Ensembl_geneid",
10556        #             "LIST_S2_score",
10557        #             "LIST_S2_pred",
10558        #         ],
10559        #     },
10560        #     {
10561        #         "transcripts_column": "Ensembl_transcriptid",
10562        #         "transcripts_infos_columns": [
10563        #             "genename",
10564        #             "VARITY_R_score",
10565        #             "Aloft_pred",
10566        #         ],
10567        #     },
10568        # ],
10569
10570        # Init
10571        if temporary_tables is None:
10572            temporary_tables = []
10573        if annotation_fields is None:
10574            annotation_fields = []
10575
10576        # Variants table
10577        table_variants = self.get_table_variants()
10578
10579        for columns_map in columns_maps:
10580
10581            # Transcript column
10582            transcripts_column = columns_map.get("transcripts_column", None)
10583
10584            # Transcripts infos columns
10585            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10586
10587            # Transcripts infos columns rename
10588            column_rename = columns_map.get("column_rename", column_rename)
10589
10590            # Transcripts infos columns clean
10591            column_clean = columns_map.get("column_clean", column_clean)
10592
10593            # Transcripts infos columns case
10594            column_case = columns_map.get("column_case", column_case)
10595
10596            if transcripts_column is not None:
10597
10598                # Explode
10599                added_columns += self.explode_infos(
10600                    fields=[transcripts_column] + transcripts_infos_columns
10601                )
10602
10603                # View clauses
10604                clause_select_variants = []
10605                clause_select_tanscripts = []
10606                for field in [transcripts_column] + transcripts_infos_columns:
10607
10608                    # AS field
10609                    as_field = field
10610
10611                    # Rename
10612                    if column_rename:
10613                        as_field = column_rename.get(as_field, as_field)
10614
10615                    # Clean
10616                    if column_clean:
10617                        as_field = clean_annotation_field(as_field)
10618
10619                    # Case
10620                    if column_case:
10621                        if column_case.lower() in ["lower"]:
10622                            as_field = as_field.lower()
10623                        elif column_case.lower() in ["upper"]:
10624                            as_field = as_field.upper()
10625
10626                    # Clause select Variants
10627                    clause_select_variants.append(
10628                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10629                    )
10630
10631                    if field in [transcripts_column]:
10632                        clause_select_tanscripts.append(
10633                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10634                        )
10635                    else:
10636                        clause_select_tanscripts.append(
10637                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10638                        )
10639                        annotation_fields.append(as_field)
10640
10641                # Querey View
10642                query = f""" 
10643                    SELECT
10644                        "#CHROM", POS, REF, ALT, INFO,
10645                        "{transcripts_column}" AS 'transcript',
10646                        {", ".join(clause_select_tanscripts)}
10647                    FROM (
10648                        SELECT 
10649                            "#CHROM", POS, REF, ALT, INFO,
10650                            {", ".join(clause_select_variants)}
10651                        FROM {table_variants}
10652                        )
10653                    WHERE "{transcripts_column}" IS NOT NULL
10654                """
10655
10656                # Create temporary table
10657                temporary_table = transcripts_table + "".join(
10658                    random.choices(string.ascii_uppercase + string.digits, k=10)
10659                )
10660
10661                # Temporary_tables
10662                temporary_tables.append(temporary_table)
10663                query_view = f"""
10664                    CREATE TEMPORARY TABLE {temporary_table}
10665                    AS ({query})
10666                """
10667                self.execute_query(query=query_view)
10668
10669        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in the columns map.
  • column_rename: The column_rename parameter in the create_transcript_view_from_columns_map function is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view. This parameter provides a mapping of original column names to the desired renamed column names. By using this parameter,
  • column_clean: The column_clean parameter in the create_transcript_view_from_columns_map function is a boolean flag that determines whether the column values should be cleaned or not. If set to True, the column values will be cleaned by removing any non-alphanumeric characters from them. This cleaning process ensures consistent column naming. Defaults to False.
  • column_case: The column_case parameter in the create_transcript_view_from_columns_map function is used to specify the case transformation to be applied to the columns during the view creation process. It allows you to control whether the column values should be converted to lowercase, uppercase, or remain unchanged
Returns

The create_transcript_view_from_columns_map function returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10671    def create_transcript_view_from_column_format(
10672        self,
10673        transcripts_table: str = "transcripts",
10674        column_formats: dict = {},
10675        temporary_tables: list = None,
10676        annotation_fields: list = None,
10677        column_rename: dict = {},
10678        column_clean: bool = False,
10679        column_case: str = None,
10680    ) -> tuple[list, list, list]:
10681        """
10682        The `create_transcript_view_from_column_format` function generates a transcript view based on
10683        specified column formats, adds additional columns and annotation fields, and returns the list of
10684        temporary tables and annotation fields.
10685
10686        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10687        of the table containing the transcripts data. This table will be used as the base table for
10688        creating the transcript view. The default value for this parameter is "transcripts", but you can
10689        provide a different table name if needed, defaults to transcripts
10690        :type transcripts_table: str (optional)
10691        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10692        about the columns to be used for creating the transcript view. Each entry in the dictionary
10693        specifies the mapping between a transcripts column and a transcripts infos column. This
10694        parameter allows you to define how the columns from the transcripts table should be transformed
10695        or mapped
10696        :type column_formats: dict
10697        :param temporary_tables: The `temporary_tables` parameter in the
10698        `create_transcript_view_from_column_format` function is a list that stores the names of
10699        temporary views created during the process of creating a transcript view from a column format.
10700        These temporary views are used to manipulate and extract data before generating the final
10701        transcript view
10702        :type temporary_tables: list
10703        :param annotation_fields: The `annotation_fields` parameter in the
10704        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10705        that are extracted from the temporary views created during the process. These annotation fields
10706        are obtained by querying the temporary views and extracting the column names excluding specific
10707        columns like `#CH
10708        :type annotation_fields: list
10709        :param column_rename: The `column_rename` parameter in the
10710        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10711        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10712        column names to new column names in this dictionary, you can rename specific columns during the
10713        process
10714        :type column_rename: dict
10715        :param column_clean: The `column_clean` parameter in the
10716        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10717        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10718        will be cleaned during the creation of the transcript view based on the specified column format,
10719        defaults to False
10720        :type column_clean: bool (optional)
10721        :param column_case: The `column_case` parameter in the
10722        `create_transcript_view_from_column_format` function is used to specify the case transformation
10723        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10724        to convert the column names to uppercase or lowercase, respectively
10725        :type column_case: str
10726        :return: The `create_transcript_view_from_column_format` function returns two lists:
10727        `temporary_tables` and `annotation_fields`.
10728        """
10729
10730        log.debug("Start transcrpts view creation from column format...")
10731
10732        #  "from_column_format": [
10733        #     {
10734        #         "transcripts_column": "ANN",
10735        #         "transcripts_infos_column": "Feature_ID",
10736        #     }
10737        # ],
10738
10739        # Init
10740        if temporary_tables is None:
10741            temporary_tables = []
10742        if annotation_fields is None:
10743            annotation_fields = []
10744
10745        for column_format in column_formats:
10746
10747            # annotation field and transcript annotation field
10748            annotation_field = column_format.get("transcripts_column", "ANN")
10749            transcript_annotation = column_format.get(
10750                "transcripts_infos_column", "Feature_ID"
10751            )
10752
10753            # Transcripts infos columns rename
10754            column_rename = column_format.get("column_rename", column_rename)
10755
10756            # Transcripts infos columns clean
10757            column_clean = column_format.get("column_clean", column_clean)
10758
10759            # Transcripts infos columns case
10760            column_case = column_format.get("column_case", column_case)
10761
10762            # Temporary View name
10763            temporary_view_name = transcripts_table + "".join(
10764                random.choices(string.ascii_uppercase + string.digits, k=10)
10765            )
10766
10767            # Create temporary view name
10768            temporary_view_name = self.annotation_format_to_table(
10769                uniquify=True,
10770                annotation_field=annotation_field,
10771                view_name=temporary_view_name,
10772                annotation_id=transcript_annotation,
10773                column_rename=column_rename,
10774                column_clean=column_clean,
10775                column_case=column_case,
10776            )
10777
10778            # Annotation fields
10779            if temporary_view_name:
10780                query_annotation_fields = f"""
10781                    SELECT *
10782                    FROM (
10783                        DESCRIBE SELECT *
10784                        FROM {temporary_view_name}
10785                        )
10786                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10787                """
10788                df_annotation_fields = self.get_query_to_df(
10789                    query=query_annotation_fields
10790                )
10791
10792                # Add temporary view and annotation fields
10793                temporary_tables.append(temporary_view_name)
10794                annotation_fields += list(set(df_annotation_fields["column_name"]))
10795
10796        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. This parameter allows you to define how the columns from the transcripts table should be transformed or mapped
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding specific columns like #CHROM, POS, REF, and ALT.
  • column_rename: The column_rename parameter in the create_transcript_view_from_column_format function is a dictionary that allows you to specify custom renaming of columns in the transcripts infos table. By providing a mapping of original column names to new column names in this dictionary, you can rename specific columns during the process
  • column_clean: The column_clean parameter in the create_transcript_view_from_column_format function is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process. If set to True, the columns will be cleaned during the creation of the transcript view based on the specified column format, defaults to False
  • column_case: The column_case parameter in the create_transcript_view_from_column_format function is used to specify the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names to uppercase or lowercase, respectively
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = False, param: dict = {}) -> str:
10798    def create_transcript_view(
10799        self,
10800        transcripts_table: str = None,
10801        transcripts_table_drop: bool = False,
10802        param: dict = {},
10803    ) -> str:
10804        """
10805        The `create_transcript_view` function generates a transcript view by processing data from a
10806        specified table based on provided parameters and structural information.
10807
10808        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10809        is used to specify the name of the table that will store the final transcript view data. If a table
10810        name is not provided, the function will create a new table to store the transcript view data, and by
10811        default,, defaults to transcripts
10812        :type transcripts_table: str (optional)
10813        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10814        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10815        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10816        the function will drop the existing transcripts table if it exists, defaults to False
10817        :type transcripts_table_drop: bool (optional)
10818        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10819        contains information needed to create a transcript view. It includes details such as the structure
10820        of the transcripts, columns mapping, column formats, and other necessary information for generating
10821        the view. This parameter allows for flexibility and customization
10822        :type param: dict
10823        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10824        created or modified during the execution of the function.
10825        """
10826
10827        log.debug("Start transcripts view creation...")
10828
10829        # Default
10830        transcripts_table_default = "transcripts"
10831
10832        # Param
10833        if not param:
10834            param = self.get_param()
10835
10836        # Struct
10837        struct = param.get("transcripts", {}).get("struct", None)
10838
10839        # Transcript veresion
10840        transcript_id_remove_version = param.get("transcripts", {}).get(
10841            "transcript_id_remove_version", False
10842        )
10843
10844        # Transcripts mapping
10845        transcript_id_mapping_file = param.get("transcripts", {}).get(
10846            "transcript_id_mapping_file", None
10847        )
10848
10849        # Transcripts mapping
10850        transcript_id_mapping_force = param.get("transcripts", {}).get(
10851            "transcript_id_mapping_force", None
10852        )
10853
10854        if struct:
10855
10856            # Transcripts table
10857            if transcripts_table is None:
10858                transcripts_table = param.get("transcripts", {}).get(
10859                    "table", transcripts_table_default
10860                )
10861
10862            # added_columns
10863            added_columns = []
10864
10865            # Temporary tables
10866            temporary_tables = []
10867
10868            # Annotation fields
10869            annotation_fields = []
10870
10871            # from columns map
10872            columns_maps = struct.get("from_columns_map", [])
10873            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10874                self.create_transcript_view_from_columns_map(
10875                    transcripts_table=transcripts_table,
10876                    columns_maps=columns_maps,
10877                    added_columns=added_columns,
10878                    temporary_tables=temporary_tables,
10879                    annotation_fields=annotation_fields,
10880                )
10881            )
10882            added_columns += added_columns_tmp
10883            temporary_tables += temporary_tables_tmp
10884            annotation_fields += annotation_fields_tmp
10885
10886            # from column format
10887            column_formats = struct.get("from_column_format", [])
10888            temporary_tables_tmp, annotation_fields_tmp = (
10889                self.create_transcript_view_from_column_format(
10890                    transcripts_table=transcripts_table,
10891                    column_formats=column_formats,
10892                    temporary_tables=temporary_tables,
10893                    annotation_fields=annotation_fields,
10894                )
10895            )
10896            temporary_tables += temporary_tables_tmp
10897            annotation_fields += annotation_fields_tmp
10898
10899            # Remove some specific fields/column
10900            annotation_fields = list(set(annotation_fields))
10901            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10902                if field in annotation_fields:
10903                    annotation_fields.remove(field)
10904
10905            # Merge temporary tables query
10906            query_merge = ""
10907            for temporary_table in list(set(temporary_tables)):
10908
10909                # First temporary table
10910                if not query_merge:
10911                    query_merge = f"""
10912                        SELECT * FROM {temporary_table}
10913                    """
10914                # other temporary table (using UNION)
10915                else:
10916                    query_merge += f"""
10917                        UNION BY NAME SELECT * FROM {temporary_table}
10918                    """
10919
10920            # transcript table tmp
10921            transcript_table_tmp = "transcripts_tmp"
10922            transcript_table_tmp2 = "transcripts_tmp2"
10923            transcript_table_tmp3 = "transcripts_tmp3"
10924
10925            # Merge on transcript
10926            query_merge_on_transcripts_annotation_fields = []
10927
10928            # Add transcript list
10929            query_merge_on_transcripts_annotation_fields.append(
10930                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10931            )
10932
10933            # Aggregate all annotations fields
10934            for annotation_field in set(annotation_fields):
10935                query_merge_on_transcripts_annotation_fields.append(
10936                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10937                )
10938
10939            # Transcripts mapping
10940            if transcript_id_mapping_file:
10941
10942                # Transcript dataframe
10943                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10944                transcript_id_mapping_dataframe = transcripts_file_to_df(
10945                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10946                )
10947
10948                # Transcript version remove
10949                if transcript_id_remove_version:
10950                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10951                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10952                    query_left_join = f"""
10953                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10954                    """
10955                else:
10956                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10957                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10958                    query_left_join = f"""
10959                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10960                    """
10961
10962                # Transcript column for group by merge
10963                query_transcript_merge_group_by = """
10964                        CASE
10965                            WHEN transcript_mapped NOT IN ('')
10966                            THEN split_part(transcript_mapped, '.', 1)
10967                            ELSE split_part(transcript_original, '.', 1)
10968                        END
10969                    """
10970
10971                # Merge query
10972                transcripts_tmp2_query = f"""
10973                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10974                    FROM ({query_merge}) AS {transcript_table_tmp}
10975                    {query_left_join}
10976                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10977                """
10978
10979                # Retrive columns after mege
10980                transcripts_tmp2_describe_query = f"""
10981                    DESCRIBE {transcripts_tmp2_query}
10982                """
10983                transcripts_tmp2_describe_list = list(
10984                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10985                        "column_name"
10986                    ]
10987                )
10988
10989                # Create list of columns for select clause
10990                transcripts_tmp2_describe_select_clause = []
10991                for field in transcripts_tmp2_describe_list:
10992                    if field not in [
10993                        "#CHROM",
10994                        "POS",
10995                        "REF",
10996                        "ALT",
10997                        "INFO",
10998                        "transcript_mapped",
10999                    ]:
11000                        as_field = field
11001                        if field in ["transcript_original"]:
11002                            as_field = "transcripts_mapped"
11003                        transcripts_tmp2_describe_select_clause.append(
11004                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11005                        )
11006
11007                # Merge with mapping
11008                query_merge_on_transcripts = f"""
11009                    SELECT
11010                        "#CHROM", POS, REF, ALT, INFO,
11011                        CASE
11012                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11013                            THEN ANY_VALUE(transcript_mapped)
11014                            ELSE ANY_VALUE(transcript_original)
11015                        END AS transcript,
11016                        {", ".join(transcripts_tmp2_describe_select_clause)}
11017                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11018                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11019                        {query_transcript_merge_group_by}
11020                """
11021
11022                # Add transcript filter from mapping file
11023                if transcript_id_mapping_force:
11024                    query_merge_on_transcripts = f"""
11025                        SELECT *
11026                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11027                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11028                    """
11029
11030            # No transcript mapping
11031            else:
11032
11033                # Remove transcript version
11034                if transcript_id_remove_version:
11035                    query_transcript_column = f"""
11036                        split_part({transcript_table_tmp}.transcript, '.', 1)
11037                    """
11038                else:
11039                    query_transcript_column = """
11040                        transcript
11041                    """
11042
11043                # Query sections
11044                query_transcript_column_select = (
11045                    f"{query_transcript_column} AS transcript"
11046                )
11047                query_transcript_column_group_by = query_transcript_column
11048
11049                # Query for transcripts view
11050                query_merge_on_transcripts = f"""
11051                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11052                    FROM ({query_merge}) AS {transcript_table_tmp}
11053                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11054                """
11055
11056            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
11057
11058            # Drop transcript view is necessary
11059            if transcripts_table_drop:
11060                query_drop = f"""
11061                    DROP TABLE IF EXISTS {transcripts_table};
11062                """
11063                self.execute_query(query=query_drop)
11064
11065            # Merge and create transcript view
11066            query_create_view = f"""
11067                CREATE TABLE IF NOT EXISTS {transcripts_table}
11068                AS {query_merge_on_transcripts}
11069            """
11070            self.execute_query(query=query_create_view)
11071
11072            # Remove added columns
11073            for added_column in added_columns:
11074                self.drop_column(column=added_column)
11075
11076        else:
11077
11078            transcripts_table = None
11079
11080        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data; defaults to transcripts
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to False
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts', column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> str:
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Explode a structured VCF annotation INFO field (e.g. snpEff/VEP "ANN") into a
        temporary table with one typed column per annotation sub-field.

        The method reads the annotation sub-field names from the INFO header description
        (the pipe-separated list quoted in the header), converts each variant's annotation
        string to JSON via `explode_annotation_format`, detects a SQL type for each key
        with `detect_column_type`, and materializes everything as a DuckDB TEMPORARY
        TABLE named `view_name`, adding a 'transcript' column aliased from
        `annotation_id`.

        :param uniquify: Passed through to `explode_annotation_format`; when True the
            exploded annotation values are made unique, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: Name of the INFO field holding the annotations. If the
            field is absent from the VCF header infos, the method returns None,
            defaults to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used as the transcript identifier;
            it is exposed as the 'transcript' column of the created table. It is
            subject to the same rename/clean transformations as the other keys,
            defaults to "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table created to hold the exploded
            annotations, defaults to "transcripts"
        :type view_name: str (optional)
        :param column_rename: Optional mapping of original key names to replacement
            column names, applied before cleaning and case transformation
        :type column_rename: dict
        :param column_clean: When True, key names (and `annotation_id`) are passed
            through `clean_annotation_field` before being used as column names,
            defaults to False
        :type column_clean: bool (optional)
        :param column_case: Either "lower" or "upper" (case-insensitive) to force the
            case of generated column names; any other value leaves names unchanged
        :type column_case: str
        :return: The name of the temporary table created (`view_name`), or None when
            `annotation_field` is not present in the VCF header
        :raises ValueError: If the INFO header description does not contain a quoted
            pipe-separated field list, or yields no usable field names
        """

        # Name of the derived JSON column added to the working dataframe
        annotation_format = "annotation_explode"

        # Apply the same rename/clean rules to the transcript id field so it matches
        # the generated column names
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for exploded INFO columns.
        # NOTE(review): any truthy prefix from get_explode_infos_prefix() is replaced
        # by the literal "INFO/" rather than used as-is — confirm this is intentional
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names of the exploded annotation field and its JSON counterpart
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table name
        table_variants = self.get_table_variants()

        # VCF header (provides the INFO field descriptions)
        vcf_reader = self.get_header()

        # Columns added to the variants table during processing; dropped at the end
        added_columns = []

        # Explode the annotation INFO field into a real column on the variants table
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the pipe-separated sub-field list quoted in the header
            # description, e.g. "Functional annotations: 'A | B | C'"
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                # Map alphanumeric-only field names to their original header text
                # (only ann_header_desc values are used downstream)
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Ensure a variant id column exists (dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull variants plus the exploded annotation column into a dataframe;
            # DuckDB queries below reference this dataframe by its variable name
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation string to a JSON document keyed by the
            # header-derived field names
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Collect the distinct JSON keys present in the first annotation entry
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Original key and its (possibly transformed) column name
                key = row.iloc[0]
                key_clean = key

                # Optional key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # Optional key cleaning
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Optional case normalization ("lower"/"upper" only)
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Sample the key's values to infer a SQL type.
                # NOTE(review): trim('{key}') trims the literal key string, so the
                # WHERE clause only filters out empty key names — confirm intent
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect the SQL column type from the remaining sample values
                column_type = detect_column_type(df_json_type[key_clean])

                # Cast-and-alias expression for this key (empty strings become NULL)
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Materialize the exploded annotations as a temporary table, exposing
            # the annotation_id column under the name 'transcript'
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                        )
                    );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the header: signal by returning None
            view_name = None

        # Drop the helper columns added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name

The annotation_format_to_table function converts annotation data from a VCF file into a structured table format, ensuring unique values and creating a temporary table for further processing or analysis.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function. By default, it is set to "ANN", defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the resulting table; defaults to Feature_ID
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis; defaults to transcripts
  • column_rename: The column_rename parameter in the annotation_format_to_table method is a dictionary that allows you to specify custom renaming for columns. By providing key-value pairs in this dictionary, you can rename specific columns in the resulting table or view that is created based on the annotation data. This feature enables
  • column_clean: The column_clean parameter in the annotation_format_to_table method is a boolean flag that determines whether the annotation field should undergo a cleaning process. If set to True, the function will clean the annotation field before further processing. This cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults to False
  • column_case: The column_case parameter in the annotation_format_to_table method is used to specify the case transformation to be applied to the column names extracted from the annotation data. It allows you to set the case of the column names to either lowercase or uppercase for consistency or other specific requirements during the conversion
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field_json: str = None, transcripts_info_format: str = None, transcripts_info_field_format: str = None, param: dict = {}) -> bool:
11282    def transcript_view_to_variants(
11283        self,
11284        transcripts_table: str = None,
11285        transcripts_column_id: str = None,
11286        transcripts_info_json: str = None,
11287        transcripts_info_field_json: str = None,
11288        transcripts_info_format: str = None,
11289        transcripts_info_field_format: str = None,
11290        param: dict = {},
11291    ) -> bool:
11292        """
11293        The `transcript_view_to_variants` function updates a variants table with information from
11294        transcripts in JSON format.
11295
11296        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11297        table containing the transcripts data. If this parameter is not provided, the function will
11298        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11299        :type transcripts_table: str
11300        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11301        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11302        identifier is used to match transcripts with variants in the database
11303        :type transcripts_column_id: str
11304        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11305        of the column in the variants table where the transcripts information will be stored in JSON
11306        format. This parameter allows you to define the column in the variants table that will hold the
11307        JSON-formatted information about transcripts
11308        :type transcripts_info_json: str
11309        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11310        specify the field in the VCF header that will contain information about transcripts in JSON
11311        format. This field will be added to the VCF header as an INFO field with the specified name
11312        :type transcripts_info_field_json: str
11313        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11314        format of the information about transcripts that will be stored in the variants table. This
11315        format can be used to define how the transcript information will be structured or displayed
11316        within the variants table
11317        :type transcripts_info_format: str
11318        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11319        specify the field in the VCF header that will contain information about transcripts in a
11320        specific format. This field will be added to the VCF header as an INFO field with the specified
11321        name
11322        :type transcripts_info_field_format: str
11323        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11324        that contains various configuration settings related to transcripts. It is used to provide
11325        default values for certain parameters if they are not explicitly provided when calling the
11326        method. The `param` dictionary can be passed as an argument
11327        :type param: dict
11328        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11329        if the operation is successful and `False` if certain conditions are not met.
11330        """
11331
11332        msg_info_prefix = "Start transcripts view to variants annotations"
11333
11334        log.debug(f"{msg_info_prefix}...")
11335
11336        # Default
11337        transcripts_table_default = "transcripts"
11338        transcripts_column_id_default = "transcript"
11339        transcripts_info_json_default = None
11340        transcripts_info_format_default = None
11341        transcripts_info_field_json_default = None
11342        transcripts_info_field_format_default = None
11343
11344        # Param
11345        if not param:
11346            param = self.get_param()
11347
11348        # Transcripts table
11349        if transcripts_table is None:
11350            transcripts_table = param.get("transcripts", {}).get(
11351                "table", transcripts_table_default
11352            )
11353
11354        # Transcripts column ID
11355        if transcripts_column_id is None:
11356            transcripts_column_id = param.get("transcripts", {}).get(
11357                "column_id", transcripts_column_id_default
11358            )
11359
11360        # Transcripts info json
11361        if transcripts_info_json is None:
11362            transcripts_info_json = param.get("transcripts", {}).get(
11363                "transcripts_info_json", transcripts_info_json_default
11364            )
11365
11366        # Transcripts info field JSON
11367        if transcripts_info_field_json is None:
11368            transcripts_info_field_json = param.get("transcripts", {}).get(
11369                "transcripts_info_field_json", transcripts_info_field_json_default
11370            )
11371        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11372        #     transcripts_info_json = transcripts_info_field_json
11373
11374        # Transcripts info format
11375        if transcripts_info_format is None:
11376            transcripts_info_format = param.get("transcripts", {}).get(
11377                "transcripts_info_format", transcripts_info_format_default
11378            )
11379
11380        # Transcripts info field FORMAT
11381        if transcripts_info_field_format is None:
11382            transcripts_info_field_format = param.get("transcripts", {}).get(
11383                "transcripts_info_field_format", transcripts_info_field_format_default
11384            )
11385        # if (
11386        #     transcripts_info_field_format is not None
11387        #     and transcripts_info_format is None
11388        # ):
11389        #     transcripts_info_format = transcripts_info_field_format
11390
11391        # Variants table
11392        table_variants = self.get_table_variants()
11393
11394        # Check info columns param
11395        if (
11396            transcripts_info_json is None
11397            and transcripts_info_field_json is None
11398            and transcripts_info_format is None
11399            and transcripts_info_field_format is None
11400        ):
11401            return False
11402
11403        # Transcripts infos columns
11404        query_transcripts_infos_columns = f"""
11405            SELECT *
11406            FROM (
11407                DESCRIBE SELECT * FROM {transcripts_table}
11408                )
11409            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11410        """
11411        transcripts_infos_columns = list(
11412            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11413        )
11414
11415        # View results
11416        clause_select = []
11417        clause_to_json = []
11418        clause_to_format = []
11419        for field in transcripts_infos_columns:
11420            # Do not consider INFO field for export into fields
11421            if field not in ["INFO"]:
11422                clause_select.append(
11423                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11424                )
11425                clause_to_json.append(f""" '{field}': "{field}" """)
11426                clause_to_format.append(f""" "{field}" """)
11427
11428        # Update
11429        update_set_json = []
11430        update_set_format = []
11431
11432        # VCF header
11433        vcf_reader = self.get_header()
11434
11435        # Transcripts to info column in JSON
11436        if transcripts_info_json:
11437
11438            # Create column on variants table
11439            self.add_column(
11440                table_name=table_variants,
11441                column_name=transcripts_info_json,
11442                column_type="JSON",
11443                default_value=None,
11444                drop=False,
11445            )
11446
11447            # Add header
11448            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11449                transcripts_info_json,
11450                ".",
11451                "String",
11452                "Transcripts in JSON format",
11453                "unknwon",
11454                "unknwon",
11455                self.code_type_map["String"],
11456            )
11457
11458            # Add to update
11459            update_set_json.append(
11460                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11461            )
11462
11463        # Transcripts to info field in JSON
11464        if transcripts_info_field_json:
11465
11466            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11467
11468            # Add to update
11469            update_set_json.append(
11470                f""" 
11471                    INFO = concat(
11472                            CASE
11473                                WHEN INFO NOT IN ('', '.')
11474                                THEN INFO
11475                                ELSE ''
11476                            END,
11477                            CASE
11478                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11479                                THEN concat(
11480                                    ';{transcripts_info_field_json}=',
11481                                    t.{transcripts_info_json}
11482                                )
11483                                ELSE ''
11484                            END
11485                            )
11486                """
11487            )
11488
11489            # Add header
11490            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11491                transcripts_info_field_json,
11492                ".",
11493                "String",
11494                "Transcripts in JSON format",
11495                "unknwon",
11496                "unknwon",
11497                self.code_type_map["String"],
11498            )
11499
11500        if update_set_json:
11501
11502            # Update query
11503            query_update = f"""
11504                UPDATE {table_variants}
11505                    SET {", ".join(update_set_json)}
11506                FROM
11507                (
11508                    SELECT
11509                        "#CHROM", POS, REF, ALT,
11510                            concat(
11511                            '{{',
11512                            string_agg(
11513                                '"' || "{transcripts_column_id}" || '":' ||
11514                                to_json(json_output)
11515                            ),
11516                            '}}'
11517                            )::JSON AS {transcripts_info_json}
11518                    FROM
11519                        (
11520                        SELECT
11521                            "#CHROM", POS, REF, ALT,
11522                            "{transcripts_column_id}",
11523                            to_json(
11524                                {{{",".join(clause_to_json)}}}
11525                            )::JSON AS json_output
11526                        FROM
11527                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11528                        WHERE "{transcripts_column_id}" IS NOT NULL
11529                        )
11530                    GROUP BY "#CHROM", POS, REF, ALT
11531                ) AS t
11532                WHERE {table_variants}."#CHROM" = t."#CHROM"
11533                    AND {table_variants}."POS" = t."POS"
11534                    AND {table_variants}."REF" = t."REF"
11535                    AND {table_variants}."ALT" = t."ALT"
11536            """
11537
11538            self.execute_query(query=query_update)
11539
11540        # Transcripts to info column in FORMAT
11541        if transcripts_info_format:
11542
11543            # Create column on variants table
11544            self.add_column(
11545                table_name=table_variants,
11546                column_name=transcripts_info_format,
11547                column_type="VARCHAR",
11548                default_value=None,
11549                drop=False,
11550            )
11551
11552            # Add header
11553            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11554                transcripts_info_format,
11555                ".",
11556                "String",
11557                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11558                "unknwon",
11559                "unknwon",
11560                self.code_type_map["String"],
11561            )
11562
11563            # Add to update
11564            update_set_format.append(
11565                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11566            )
11567
11568        else:
11569
11570            # Set variable for internal queries
11571            transcripts_info_format = "transcripts_info_format"
11572
11573        # Transcripts to info field in JSON
11574        if transcripts_info_field_format:
11575
11576            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11577
11578            # Add to update
11579            update_set_format.append(
11580                f""" 
11581                    INFO = concat(
11582                            CASE
11583                                WHEN INFO NOT IN ('', '.')
11584                                THEN INFO
11585                                ELSE ''
11586                            END,
11587                            CASE
11588                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11589                                THEN concat(
11590                                    ';{transcripts_info_field_format}=',
11591                                    t.{transcripts_info_format}
11592                                )
11593                                ELSE ''
11594                            END
11595                            )
11596                """
11597            )
11598
11599            # Add header
11600            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11601                transcripts_info_field_format,
11602                ".",
11603                "String",
11604                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11605                "unknwon",
11606                "unknwon",
11607                self.code_type_map["String"],
11608            )
11609
11610        if update_set_format:
11611
11612            # Update query
11613            query_update = f"""
11614                UPDATE {table_variants}
11615                    SET {", ".join(update_set_format)}
11616                FROM
11617                (
11618                    SELECT
11619                        "#CHROM", POS, REF, ALT,
11620                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11621                    FROM 
11622                        (
11623                        SELECT
11624                            "#CHROM", POS, REF, ALT,
11625                            "{transcripts_column_id}",
11626                            concat(
11627                                "{transcripts_column_id}",
11628                                '|',
11629                                {", '|', ".join(clause_to_format)}
11630                            ) AS {transcripts_info_format}
11631                        FROM
11632                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11633                        )
11634                    GROUP BY "#CHROM", POS, REF, ALT
11635                ) AS t
11636                WHERE {table_variants}."#CHROM" = t."#CHROM"
11637                    AND {table_variants}."POS" = t."POS"
11638                    AND {table_variants}."REF" = t."REF"
11639                    AND {table_variants}."ALT" = t."ALT"
11640            """
11641
11642            self.execute_query(query=query_update)
11643
11644        return True

The transcript_view_to_variants function updates a variants table with information from transcripts in JSON format.

Parameters
  • transcripts_table: The transcripts_table parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary or use a default value of "transcripts"
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts
  • transcripts_info_field_json: The transcripts_info_field_json parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • transcripts_info_format: The transcripts_info_format parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table
  • transcripts_info_field_format: The transcripts_info_field_format parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name
  • param: The param parameter in the transcript_view_to_variants method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The param dictionary can be passed as an argument
Returns

The function transcript_view_to_variants returns a boolean value. It returns True if the operation is successful and False if certain conditions are not met.

def rename_info_fields(self, fields_to_rename: dict = None, table: str = None) -> dict:
11646    def rename_info_fields(
11647        self, fields_to_rename: dict = None, table: str = None
11648    ) -> dict:
11649        """
11650        The `rename_info_fields` function renames specified fields in a VCF file header and updates
11651        corresponding INFO fields in the variants table.
11652
11653        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
11654        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
11655        represent the original field names that need to be renamed, and the corresponding values
11656        represent the new names to which the fields should be
11657        :type fields_to_rename: dict
11658        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
11659        the table in which the variants data is stored. This table contains information about genetic
11660        variants, and the function updates the corresponding INFO fields in this table when renaming
11661        specified fields in the VCF file header
11662        :type table: str
11663        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
11664        the original field names as keys and their corresponding new names (or None if the field was
11665        removed) as values after renaming or removing specified fields in a VCF file header and updating
11666        corresponding INFO fields in the variants table.
11667        """
11668
11669        # Init
11670        fields_renamed = {}
11671        config = self.get_config()
11672        access = config.get("access")
11673
11674        if table is None:
11675            table = self.get_table_variants()
11676
11677        if fields_to_rename is not None and access not in ["RO"]:
11678
11679            log.info("Rename or remove fields...")
11680
11681            # Header
11682            header = self.get_header()
11683
11684            for field_to_rename, field_renamed in fields_to_rename.items():
11685
11686                if field_to_rename in header.infos:
11687
11688                    # Rename header
11689                    if field_renamed is not None:
11690                        header.infos[field_renamed] = vcf.parser._Info(
11691                            field_renamed,
11692                            header.infos[field_to_rename].num,
11693                            header.infos[field_to_rename].type,
11694                            header.infos[field_to_rename].desc,
11695                            header.infos[field_to_rename].source,
11696                            header.infos[field_to_rename].version,
11697                            header.infos[field_to_rename].type_code,
11698                        )
11699                    del header.infos[field_to_rename]
11700
11701                    # Rename INFO patterns
11702                    field_pattern = rf'(^|;)({field_to_rename})=([^;]*)'
11703                    if field_renamed is not None:
11704                        field_renamed_pattern = rf'\1{field_renamed}=\3'
11705                    else:
11706                        field_renamed_pattern = ''
11707
11708                    # Rename INFO
11709                    query = f"""
11710                        UPDATE {table}
11711                        SET
11712                            INFO = regexp_replace(INFO, '{field_pattern}', '{field_renamed_pattern}', 'g')
11713                    """
11714                    self.execute_query(query=query)
11715
11716                    # Return
11717                    fields_renamed[field_to_rename] = field_renamed
11718
11719                    # Log
11720                    if field_renamed is not None:
11721                        log.info(f"Rename or remove fields: field '{field_to_rename}' renamed to '{field_renamed}'")
11722                    else:
11723                        log.info(f"Rename or remove fields: field '{field_to_rename}' removed")
11724
11725        return fields_renamed

The rename_info_fields function renames specified fields in a VCF file header and updates corresponding INFO fields in the variants table.

Parameters
  • fields_to_rename: The fields_to_rename parameter is a dictionary that contains the mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary represent the original field names that need to be renamed, and the corresponding values represent the new names to which the fields should be renamed (a value of None removes the field instead of renaming it)
  • table: The table parameter in the rename_info_fields function represents the name of the table in which the variants data is stored. This table contains information about genetic variants, and the function updates the corresponding INFO fields in this table when renaming specified fields in the VCF file header
Returns

The rename_info_fields function returns a dictionary fields_renamed that contains the original field names as keys and their corresponding new names (or None if the field was removed) as values after renaming or removing specified fields in a VCF file header and updating corresponding INFO fields in the variants table.

def calculation_rename_info_fields( self, fields_to_rename: dict = None, table: str = None, operation_name: str = 'RENAME_INFO_FIELDS') -> None:
11727    def calculation_rename_info_fields(
11728        self,
11729        fields_to_rename: dict = None,
11730        table: str = None,
11731        operation_name: str = "RENAME_INFO_FIELDS",
11732    ) -> None:
11733        """
11734        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11735        fields to rename and table if provided, and then calls another function to rename the fields.
11736
11737        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11738        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11739        the key and the new field name as the value
11740        :type fields_to_rename: dict
11741        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11742        specify the name of the table for which the fields are to be renamed. It is a string type
11743        parameter
11744        :type table: str
11745        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11746        method is a string that specifies the name of the operation being performed. In this context, it
11747        is used as a default value for the operation name if not explicitly provided when calling the
11748        function, defaults to RENAME_INFO_FIELDS
11749        :type operation_name: str (optional)
11750        """
11751
11752        # Param
11753        param = self.get_param()
11754
11755        # Get param fields to rename
11756        param_fields_to_rename = (
11757            param.get("calculation", {})
11758            .get("calculations", {})
11759            .get(operation_name, {})
11760            .get("fields_to_rename", None)
11761        )
11762
11763        # Get param table
11764        param_table = (
11765            param.get("calculation", {})
11766            .get("calculations", {})
11767            .get(operation_name, {})
11768            .get("table", None)
11769        )
11770
11771        # Init fields_to_rename
11772        if fields_to_rename is None:
11773            fields_to_rename = param_fields_to_rename
11774
11775        # Init table
11776        if table is None:
11777            table = param_table
11778
11779        renamed_fields = self.rename_info_fields(
11780            fields_to_rename=fields_to_rename, table=table
11781        )
11782
11783        log.debug(f"renamed_fields:{renamed_fields}")

The calculation_rename_info_fields function retrieves parameters from a dictionary, updates fields to rename and table if provided, and then calls another function to rename the fields.

Parameters
  • fields_to_rename: fields_to_rename is a dictionary that contains the fields to be renamed in a table. Each key-value pair in the dictionary represents the original field name as the key and the new field name as the value
  • table: The table parameter in the calculation_rename_info_fields method is used to specify the name of the table for which the fields are to be renamed. It is a string type parameter
  • operation_name: The operation_name parameter in the calculation_rename_info_fields method is a string that specifies the name of the operation being performed. In this context, it is used as a default value for the operation name if not explicitly provided when calling the function, defaults to RENAME_INFO_FIELDS